This report makes use of data from a personal activity monitoring device. This device collects data at 5-minute intervals throughout the day. The data consist of two months of observations from an anonymous individual, collected during October and November 2012, and include the number of steps taken in each 5-minute interval of each day.
Dataset: Activity monitoring data, 52K
# Read activity.csv directly from the zip archive without unpacking it to
# disk; the unz() connection is handed to read.csv() unopened.
temp <- unz(description = "activity.zip", filename = "activity.csv")
data <- read.csv(file = temp)
# Turn the integer interval identifier into a clock-time label ("HH:MM") and
# a POSIXct value that can be used as a plotting axis.
# Zero-pad once to four digits so the value can be split by position into
# its first two and last two characters.
interval.hhmm <- sprintf("%04d", data$interval)
hours <- substr(interval.hhmm, 1, 2)
minutes <- substr(interval.hhmm, 3, 4)
data$time_str <- paste(hours, minutes, sep = ":")
# Only the time of day is given in the format, so the date part is filled in
# by the parser. Parsing in UTC avoids NA results for clock times that do
# not exist in the local time zone on a daylight-saving changeover day.
data$time <- as.POSIXct(data$time_str, format = "%H:%M", tz = "UTC")
# Total steps per day. The formula interface of aggregate() omits rows whose
# steps value is NA before summing.
total.steps.day <- aggregate(steps ~ date, data = data, FUN = sum)
hist(
  total.steps.day$steps,
  main = "Histogram of steps per day",
  xlab = "Steps per day"
)
# Mean and median of the daily totals.
mean.total.steps.day <- with(total.steps.day, mean(steps))
median.total.steps.day <- with(total.steps.day, median(steps))
For total number of steps taken per day, mean = 10766.19, median = 10765.00
# Average number of steps per interval across all days (rows with missing
# steps are omitted by aggregate's formula interface), drawn as a line plot
# with a custom time axis.
mean.steps.interval <- aggregate(steps ~ interval + time + time_str, data, mean)
plot(mean.steps.interval$time, mean.steps.interval$steps, type = "l",
     main = "Average steps by day time", ylab = "Number of steps",
     xlab = "Day time", xaxt = "n")
# Interval(s) with the highest average; ties would give more than one row.
max.steps <- max(mean.steps.interval$steps)
max.steps.intervals.df <- mean.steps.interval[mean.steps.interval$steps == max.steps, ]
max.steps.intervals.arr <- max.steps.intervals.df$time
# abline() is vectorised over v, so no index loop is needed.
abline(v = max.steps.intervals.arr, col = "red", lwd = 1)
# Custom x axis: a tick at every 36th row plus one at each maximum.
cnt <- length(mean.steps.interval$time)
tm <- mean.steps.interval$time[seq(1, cnt, 36)]
# c() keeps the POSIXct class, whereas union() would reduce it to bare numbers.
tm <- sort(unique(c(tm, max.steps.intervals.arr)))
# Look each label up by its tick value so labels stay aligned with tm
# regardless of the row order of mean.steps.interval.
lb <- mean.steps.interval$time_str[match(tm, mean.steps.interval$time)]
axis(labels = lb, side = 1, at = tm)
legend(x = "topright", c("Maximum avg steps"), col = c("red"), lwd = c(1))
The 5-minute interval that, on average across all the days in the dataset, contains the maximum number of steps:
| time | steps |
|---|---|
| 08:35 | 206.1698 |
# Count every NA cell in the data frame (all columns, not only steps).
na.total <- length(which(is.na(data)))
Total number of missing values in the dataset: 2304
Let’s combine both proposed strategies: each missing value is replaced by the average of the mean for that day and the mean for that 5-minute interval.
# Impute missing step counts with the average of two estimates: the mean of
# the observed steps on the same day and the mean of the observed steps in
# the same interval.
data.m <- data

# Per-day mean of the observed steps. A day with no observed steps yields
# NaN, which is.na() also flags, and that is replaced by 0.
data.m$steps.m.d <- ave(data.m$steps, data.m$date,
                        FUN = function(x) mean(x, na.rm = TRUE))
steps.m.d.na <- is.na(data.m$steps.m.d)
data.m$steps.m.d[steps.m.d.na] <- 0

# Per-interval mean of the observed steps, with the same NaN -> 0 handling.
data.m$steps.m.i <- ave(data.m$steps, data.m$interval,
                        FUN = function(x) mean(x, na.rm = TRUE))
steps.m.i.na <- is.na(data.m$steps.m.i)
data.m$steps.m.i[steps.m.i.na] <- 0

# NOTE(review): because an undefined mean is replaced by 0 before averaging,
# a missing value on a day with no observations at all is imputed as half of
# its interval mean -- confirm this is intended.
data.m$steps.m <- rowMeans(data.m[c("steps.m.d", "steps.m.i")])

# Copy of the original data with only the missing steps filled in.
data.c <- data
na.indexes <- which(is.na(data$steps))
data.c$steps[na.indexes] <- data.m$steps.m[na.indexes]
# Total steps per day, recomputed on the data set with imputed values.
total.steps.day.c <- aggregate(steps ~ date, data = data.c, FUN = sum)
hist(
  total.steps.day.c$steps,
  main = "Histogram of steps per day with imputed NA",
  xlab = "Steps per day"
)
# Mean and median of the daily totals after imputation.
mean.total.steps.day.c <- with(total.steps.day.c, mean(steps))
median.total.steps.day.c <- with(total.steps.day.c, median(steps))
For total number of steps taken per day of data with imputed NA values, mean = 10060.21, median = 10395.00.
Difference from the corresponding values in the original data: mean diff = -705.98, median diff = -370.00.
# Label every observation as weekday or weekend, then average the steps per
# interval within each of the two groups.
data.c$date.date <- as.Date(data.c$date)
# POSIXlt's wday field numbers days 0 (Sunday) to 6 (Saturday) independently
# of the session locale. weekdays() returns translated day names, so matching
# them against English names would mark every day as "weekend" in a
# non-English locale.
week.day.numbers <- 1:5
wd.factor.labels <- c("weekend", "weekday")
wd.factor.values <- as.POSIXlt(data.c$date.date)$wday %in% week.day.numbers
data.c$wd <- factor(wd.factor.values, levels = c(FALSE, TRUE), labels = wd.factor.labels)
mean.steps.interval.wd <- aggregate(steps ~ interval + wd + time + time_str, data.c, mean)
# Separate per-interval averages for each panel of the comparison plot.
mean.steps.interval.wd.week <- mean.steps.interval.wd[mean.steps.interval.wd$wd == "weekday", ]
mean.steps.interval.wd.end <- mean.steps.interval.wd[mean.steps.interval.wd$wd == "weekend", ]
# Two stacked panels (weekday on top, weekend below) sharing one time axis.
# Tick positions are every 36th row of the per-interval averages, with the
# matching "HH:MM" labels.
tm <- mean.steps.interval$time[seq(1, cnt, 36)]
lb <- mean.steps.interval[mean.steps.interval$time %in% tm, ]$time_str
# Zero inner margins make the panels touch; the outer margins hold the shared
# titles. par() returns the previous settings so they can be restored below.
old.par <- par(mfrow = c(2, 1), oma = c(5, 5, 5, 5), mar = c(0, 0, 0, 0))

# Upper panel: weekdays. Value labels on the right, unlabelled ticks on the
# left and along the top.
max.steps.week <- max(mean.steps.interval.wd.week$steps)
ylim.week <- max.steps.week * 1.1  # 10% headroom above the peak
plot(mean.steps.interval.wd.week$time, mean.steps.interval.wd.week$steps,
     type = "l", xaxt = "n", yaxt = "n", ylim = c(0, ylim.week))
axis(side = 2, labels = FALSE)
axis(side = 3, at = tm, labels = FALSE)
axis(side = 4)
mtext("weekday", line = -1, side = 3)

# Lower panel: weekends. Value labels on the left, unlabelled ticks on the
# right, and the labelled time axis along the bottom.
max.steps.end <- max(mean.steps.interval.wd.end$steps)
ylim.end <- max.steps.end * 1.1  # 10% headroom above the peak
plot(mean.steps.interval.wd.end$time, mean.steps.interval.wd.end$steps,
     type = "l", xaxt = "n", yaxt = "n", ylim = c(0, ylim.end))
axis(side = 2)
axis(side = 4, labels = FALSE)
axis(labels = lb, side = 1, at = tm)
mtext("weekend", line = -1, side = 3)

# Shared axis titles and main title, written into the outer margins.
mtext("Day time", line = 3, side = 1, outer = TRUE)
mtext("Number of steps", line = 3, side = 2, outer = TRUE)
mtext("Average steps by day time during weekday/weekend", line = 2, side = 3, outer = TRUE, cex = 1.5)

# Put the graphics parameters back so later plots are not affected.
par(old.par)