knitr::opts_chunk$set(warning=FALSE, fig.height=5, fig.width=8, cache=TRUE)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
activity <- read.csv("activity.csv")
activity$date <- as.Date(activity$date, "%Y-%m-%d")
summary(activity)
## steps date interval
## Min. : 0.00 Min. :2012-10-01 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.:2012-10-16 1st Qu.: 588.8
## Median : 0.00 Median :2012-10-31 Median :1177.5
## Mean : 37.38 Mean :2012-10-31 Mean :1177.5
## 3rd Qu.: 12.00 3rd Qu.:2012-11-15 3rd Qu.:1766.2
## Max. :806.00 Max. :2012-11-30 Max. :2355.0
## NA's :2304
Below is a histogram showing the total number of steps taken per day:
tot_steps <- with(activity, aggregate(x=steps, by=list(date), FUN=sum, na.rm=TRUE))
names(tot_steps) <- c("date", "steps")
hist(x=tot_steps$steps, breaks=seq(0, 25000, by=1000), main="Histogram of total number of steps taken per day", xlab="Total number of steps taken per day")
The mean of the total number of steps taken per day is
mean(tot_steps$steps)
## [1] 9354.23
while the median is
median(tot_steps$steps)
## [1] 10395
Below is a time series plot of the 5-minute interval and the average number of steps taken, averaged across all days:
avg_act <- with(activity, aggregate(x=steps, by=list(interval), FUN=mean, na.rm=TRUE))
names(avg_act) <- c("interval", "mean_step_num")
plot(x=avg_act$interval, y=avg_act$mean_step_num, type="l", main="Number of steps taken on average across the duration", xlab="Interval", ylab="Average number of steps", lwd = 1.5)
The 5-minute interval that contains the maximum number of steps on average is
avg_act[which.max(avg_act$mean_step_num), ]$interval
## [1] 835
The total number of missing values in the dataset is
sum(is.na(activity$steps))
## [1] 2304
The missing values are filled in with the mean number of steps of the interval they belong to:
filled_na <- avg_act$mean_step_num[match(activity$interval, avg_act$interval)]
Add in the imputed values to create a new dataset:
new_activity <- transform(activity, steps=ifelse(is.na(activity$steps), yes=filled_na, no=activity$steps))
new_tot_steps <- aggregate(new_activity$steps~new_activity$date, FUN=sum)
names(new_tot_steps) <- c("date", "steps")
The histogram of total number of steps taken per day is updated to be compared with the previous graph:
hist(new_tot_steps$steps, breaks=seq(0, 25000, by=1000), main="Histogram of total number of steps taken per day", xlab="Total number of steps taken per day", ylim=c(0, 20))
The new mean of the total number of steps taken per day is
mean(new_tot_steps$steps)
## [1] 10766.19
while the new median is
median(new_tot_steps$steps)
## [1] 10766.19
The values are different from what were seen previously. The imputation of missing data has increased the estimates of the total daily number of steps.
A new factor variable ‘day’ is created to indicate if a given day is a weekday or weekend:
new_activity <- mutate(new_activity, day=weekdays(new_activity$date))
weekdays_list <- c('Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday')
new_activity$day <- factor((weekdays(new_activity$date) %in% weekdays_list),
levels=c(TRUE, FALSE), labels=c('Weekdays', 'Weekends'))
To show the differences in activity patterns between weekdays and weekends, a panel of time series plots of the time interval and average number of steps taken, averaged across all weekdays or weekends, is created as below:
act_days <- aggregate(steps~interval + day, data=new_activity, FUN=mean)
ggplot(data=act_days, aes(x=interval, y=steps, color=day)) +
geom_line() +
labs(title="Average daily steps by type of date", x="Interval", y="Average number of steps") +
facet_wrap(facets=~day, nrow=2, ncol=1)