library(knitr)
## Warning: package 'knitr' was built under R version 3.2.2
opts_chunk$set(echo=TRUE)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the raw activity log and convert its date column to Date.
activityData <- read.csv('./data/activity.csv', header=TRUE, sep=',')
activityData$date <- as.Date(activityData$date)
# Total steps per day; rows with a missing step count are dropped first,
# so a day with no recorded steps at all does not appear in the result.
stepsPerDay <- activityData %>%
  filter(!is.na(steps)) %>%
  group_by(date) %>%
  summarize(stepSum=sum(steps)) %>%
  print()
## Source: local data frame [53 x 2]
##
## date stepSum
## 1 2012-10-02 126
## 2 2012-10-03 11352
## 3 2012-10-04 12116
## 4 2012-10-05 13294
## 5 2012-10-06 15420
## 6 2012-10-07 11015
## 7 2012-10-09 12811
## 8 2012-10-10 9900
## 9 2012-10-11 10304
## 10 2012-10-12 17382
## .. ... ...
# Histogram of the daily step totals.
hist(stepsPerDay[['stepSum']],
     breaks=10,
     col='red',
     main='Histogram of Total Number of Steps per Day',
     xlab='Steps Per Day')
# Median of the daily totals.
median(stepsPerDay[['stepSum']])
## [1] 10765
# Mean of the daily totals.
mean(stepsPerDay[['stepSum']])
## [1] 10766.19
# Mean step count for each 5-minute interval, taken over the rows
# that have a recorded (non-missing) step count.
stepsPerInterval <- activityData %>%
  filter(!is.na(steps)) %>%
  group_by(interval) %>%
  summarize(Steps=mean(steps)) %>%
  print()
## Source: local data frame [288 x 2]
##
## interval Steps
## 1 0 1.7169811
## 2 5 0.3396226
## 3 10 0.1320755
## 4 15 0.1509434
## 5 20 0.0754717
## 6 25 2.0943396
## 7 30 0.5283019
## 8 35 0.8679245
## 9 40 0.0000000
## 10 45 1.4716981
## .. ... ...
Time Series plot
# Line plot of the average step count against the 5-minute interval.
# x and y are passed first and `type` is spelled out in full, rather than
# relying on partial matching of `typ` and on positional arguments being
# separated by a named one.
plot(stepsPerInterval$interval, stepsPerInterval$Steps, type='l',
     main='Average Daily Activity Pattern',
     xlab='5-Minute Interval', ylab='Average Number of Steps')
# Row for the interval with the highest average step count. which.max()
# locates the maximum directly instead of comparing floating-point means
# with ==; if several intervals tie, the first one is returned.
stepsPerInterval[which.max(stepsPerInterval$Steps), ]
## Source: local data frame [1 x 2]
##
## interval Steps
## 1 835 206.1698
# Number of rows that contain at least one missing value.
# complete.cases() flags rows with no NA, so negating it counts each
# incomplete row exactly once (summing is.na() would count NA cells instead).
sum(!complete.cases(activityData))
## [1] 2304
# Impute every missing step count with the mean of its 5-minute interval.
# meanInterval is named by interval, so it is looked up by the interval's
# character label rather than by position.
meanInterval <- tapply(activityData$steps, activityData$interval, mean, na.rm=TRUE)
naData <- is.na(activityData$steps)
newDataset <- activityData
newDataset$steps[naData] <- meanInterval[
  as.character(newDataset$interval[naData])
]
Check for missing values
# TRUE if any step count is still missing after imputation.
any(is.na(newDataset[['steps']]))
## [1] FALSE
# Total steps per day in the imputed dataset.
perDay <- tapply(newDataset$steps, newDataset$date, sum)
# breaks=10 matches the histogram of the un-imputed daily totals, so the two
# histograms use the same binning request and can be compared directly.
hist(perDay, col='red', breaks=10, xlab='Steps Per Day', main='Histogram: Total Number of Steps per day without NAs')
# Median and mean of the imputed daily totals.
median(perDay)
## [1] 10766.19
mean(perDay)
## [1] 10766.19
Although the mean and median were approximately equal to begin with (the data were roughly normal and the distribution approximately symmetrical), imputing the missing data does have an impact on the estimates, insofar as the mean and median converge after imputation.
x <- newDataset # make a copy of newDataset
# Label each date as 'Weekend' or 'Weekday'.
# The numeric day of week from POSIXlt (0 = Sunday ... 6 = Saturday) is used
# instead of weekdays(), whose day names depend on the session locale and
# would not match English names such as 'Saturday' under a non-English locale.
dayIndex <- as.POSIXlt(x$date)$wday
# == (not %in%) so that a missing date yields NA rather than 'Weekday'.
x$day <- factor(ifelse(dayIndex == 0L | dayIndex == 6L, 'Weekend', 'Weekday'),
                levels=c('Weekday', 'Weekend'))
First, calculate the average number of steps in each 5-minute interval, averaged separately across weekday days and weekend days.
# Mean steps for every (day type, interval) pair.
# x is built from the imputed dataset, so no NA filter is applied here.
newStepsInterval <- x %>%
  group_by(day, interval) %>%
  summarize(Steps=mean(steps)) %>%
  print()
## Source: local data frame [576 x 3]
## Groups: day
##
## day interval Steps
## 1 Weekday 0 2.25115304
## 2 Weekday 5 0.44528302
## 3 Weekday 10 0.17316562
## 4 Weekday 15 0.19790356
## 5 Weekday 20 0.09895178
## 6 Weekday 25 1.59035639
## 7 Weekday 30 0.69266247
## 8 Weekday 35 1.13794549
## 9 Weekday 40 0.00000000
## 10 Weekday 45 1.79622642
## .. ... ... ...
Plot of Weekend days versus weekday days.
# Two stacked panels: weekend on top, weekday below.
# par() returns the previous settings; they are restored after plotting so the
# 2x1 layout does not leak into later plots drawn on the same device.
oldPar <- par(mfrow = c(2,1))
with(subset(newStepsInterval, day=='Weekend'),
     plot(interval, Steps, ylab='Number of Steps', main='Weekend', type='l'))
with(subset(newStepsInterval, day=='Weekday'),
     plot(interval, Steps, ylab='Number of Steps', main='Weekday', type='l'))
par(oldPar)