# Read the activity data straight out of the zip archive and convert the
# date column from text to Date.
activity <- read.table(unz("activity.zip", "activity.csv"), header = TRUE, sep = ",")
activity$date <- as.Date(activity$date, "%Y-%m-%d")

# Total steps per day. sum() is called without na.rm, so a day containing
# any NA step count gets an NA total.
stepsums <- aggregate(activity$steps, by = list(activity$date), FUN = sum)
# Expose the grouping column (Group.1) under the name `date` for plotting.
stepsums$date <- as.Date(stepsums$Group.1, "%Y-%m-%d")

library(ggplot2)
# One bar per day, with the bar height taken directly from the precomputed
# total. geom_col() is the ggplot2 geom for bars whose heights are values in
# the data; geom_histogram(stat = "identity") drew the same bars but routed
# pre-aggregated values through a geom that is meant to bin raw data.
ggplot(stepsums, aes(x = date, y = x)) +
  geom_col() +
  scale_x_date(date_breaks = "1 day", date_labels = "%Y-%m-%d", expand = c(0, 0)) +
  ylab("Steps") +
  xlab("Date") +
  theme(axis.text.x = element_text(angle = 90))

# Mean (rounded to two decimals) and median of the daily totals; days with
# an NA total are excluded.
stepmean <- round(mean(stepsums$x, na.rm = TRUE), 2)
stepmedian <- median(stepsums$x, na.rm = TRUE)
The mean total number of steps per day is 10766 and the median is 10765.
# Average number of steps per 5-minute observation for each day.
# The mean is taken over the raw observations in `activity`. Averaging
# `stepsums` by date was a no-op, because that table holds exactly one
# (total) value per date, so the plot showed daily totals under an
# "Average" axis label.
# Days without any recorded steps evaluate to NaN and show as gaps in the line.
dailymean <- with(activity, tapply(steps, date, mean, na.rm = TRUE))
plot(as.Date(names(dailymean), "%Y-%m-%d"), dailymean, type = "l", lwd = 2,
     col = "red", ylab = "Average steps per day", xlab = "Date")
# data.table supplies as.data.table(); dplyr supplies arrange(), desc() and
# the %>% pipe used below.
library(data.table)
library(dplyr)

# Convert the activity data to a data.table, then build a copy of it ordered
# from the highest step count downwards, so the largest observation ends up
# in the first row of `maxsteps`.
activity <- as.data.table(activity)
maxsteps <- activity %>%
  arrange(desc(steps))
Interval 615 is the 5-minute interval containing the highest single step count in the data.
806 steps were recorded in this interval.
# `activity2` is the object used for the missing-value analysis from here on.
activity2 <- activity
# Number of NA entries in the steps column.
missingNA <- length(which(is.na(activity2$steps)))
There are 2304 NA values in the steps column.
# Share of missing step values, as a percentage rounded to two decimals.
# `missingNA` already holds sum(is.na(activity2$steps)).
NAperc <- round(missingNA / nrow(activity2) * 100, digits = 2)
NA values account for 13.11% of the values in the steps column, and imputing the missing values is therefore not advised (more than 5% of the values are missing).
The attempt is made nonetheless:
Package used: Multivariate Imputation by Chained Equations (MICE)
Number of multiple imputations: 1
Method of imputation: PMM: Predictive mean matching
Number of iterations: 10
Randomization seed: 123
library(mice)

# The imputation model only sees two columns: the interval identifier and
# the step count. data.frame() names them activity.interval and
# activity.steps.
tempdf <- data.frame(activity$interval, activity$steps)

# Single imputation (m = 1) by predictive mean matching, 10 iterations,
# with a fixed seed so the result is reproducible.
genData <- mice(
  tempdf,
  m = 1,
  maxit = 10,
  method = "pmm",
  seed = 123
)

# Pull out the first (and only) completed data set and copy its steps
# column, now without NAs, into activity2.
tempdf2 <- complete(genData, action = 1)
activity2$steps <- tempdf2$activity.steps
Number of NA values remaining after imputing the data with the MICE package:
# Confirm that no missing step values remain after imputation.
sum(is.na(activity2$steps))
## [1] 0

# Total steps per day, recomputed on the imputed data.
stepsums2 <- aggregate(activity2$steps, by = list(activity2$date), FUN = sum)
# Expose the grouping column (Group.1) under the name `date` for plotting.
stepsums2$date <- as.Date(stepsums2$Group.1, "%Y-%m-%d")

library(ggplot2)
# One bar per day, with the bar height taken directly from the precomputed
# total. geom_col() is the ggplot2 geom for bars whose heights are values in
# the data; geom_histogram(stat = "identity") drew the same bars but routed
# pre-aggregated values through a geom that is meant to bin raw data.
ggplot(stepsums2, aes(x = date, y = x)) +
  geom_col() +
  scale_x_date(date_breaks = "1 day", date_labels = "%Y-%m-%d", expand = c(0, 0)) +
  ylab("Steps") +
  xlab("Date") +
  theme(axis.text.x = element_text(angle = 90))

# Mean (rounded to two decimals) and median of the daily totals after
# imputation.
stepmean2 <- round(mean(stepsums2$x, na.rm = TRUE), 2)
stepmedian2 <- median(stepsums2$x, na.rm = TRUE)

# Flag whether imputation changed the statistics. Doubles are compared with
# all.equal() (numeric tolerance) instead of `!=`, which also avoids an
# error from if() should either side ever be NA.
DifferenceMean <- if (isTRUE(all.equal(stepmean, stepmean2))) "No" else "Yes"
DifferenceMed <- if (isTRUE(all.equal(stepmedian, stepmedian2))) "No" else "Yes"
The newly calculated mean number of steps per day is 11397.
The newly calculated median number of steps per day is 11458.
Did imputing the missing step values make a difference in the MEAN? Yes (11397 vs 10766)
Did imputing the missing step values make a difference in the MEDIAN? Yes
What is the impact of imputing missing data on the estimates of the total daily number of steps?
Both the mean and the median have increased, and more information can now be seen in the histogram.
library(chron)
# Label every observation as "Weekend" or "Weekday" in one vectorised step
# and store the result as a factor.
# The earlier form used two partial assignments (activity2$day[mask] <- ...)
# on a column that did not exist yet. Indexing into NULL with a logical mask
# yields a vector that stops at the last TRUE, so it is shorter than the
# table whenever the trailing rows are weekdays, and assigning it back can
# be rejected as a length mismatch.
activity2$day <- factor(ifelse(is.weekend(activity2$date), "Weekend", "Weekday"))

library(lattice)
# Mean steps for each interval, computed separately for weekdays and
# weekends. Group.1 is the interval, Group.2 the day type.
wdaggr <- aggregate(activity2$steps, by = list(activity2$interval, activity2$day), FUN = mean)
# Line plot of mean steps against interval, one panel per day type,
# stacked vertically.
xyplot(wdaggr$x ~ wdaggr$Group.1 | wdaggr$Group.2, type = "l", layout = c(1, 2), xlab = "Interval", ylab = "Steps")