This document presents the results of Peer Assessment 1 for the Reproducible Research course (offered online by Johns Hopkins University) as a report written in a single R Markdown document.
This assignment makes use of data from a personal activity monitoring device. This device collects data at 5-minute intervals throughout the day. The data consist of two months of data from an anonymous individual collected during the months of October and November, 2012 and include the number of steps taken in 5-minute intervals each day.
Data
The data for this assignment is downloaded from the course web site:
Dataset: Activity monitoring data [52K] The variables included in this dataset are:
steps: Number of steps taken in a 5-minute interval (missing values are coded as NA)
date: The date on which the measurement was taken in YYYY-MM-DD format
interval: Identifier for the 5-minute interval in which measurement was taken
The dataset is stored in a comma-separated-value (CSV) file and there are a total of 17,568 observations in this dataset.
Assignment
This assignment is described in multiple parts, but ultimately it needs to be completed in a single R Markdown document that can be processed by knitr and transformed into an HTML file.
echo = TRUE is always used so that someone else will be able to read the code.
# Flag recording that code is meant to be shown alongside its output.
# NOTE(review): this is a plain variable assignment; it does not by itself
# set a knitr chunk option -- confirm how it is consumed.
echo <- TRUE
library(ggplot2)
Loading and preprocessing the data
Show any code that is needed to load and preprocess the data.
# Read the raw activity log: `steps` and `interval` as numbers, `date` as text.
data <- read.csv(
  "activity.csv",
  header = TRUE,
  na.strings = "NA",
  colClasses = c("numeric", "character", "numeric")
)
# Parse the YYYY-MM-DD strings into Date values and treat the interval
# identifiers as categories.
data$date <- as.Date(data$date, format = "%Y-%m-%d")
data$interval <- factor(data$interval)
# Show the first rows.
head(data)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
# Inspect the structure of the prepared data frame. `interval` and `date`
# were already converted right after loading, so repeating the conversion
# here would be a no-op and is omitted.
str(data)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : num NA NA NA NA NA NA NA NA NA NA ...
## $ date : Date, format: "2012-10-01" "2012-10-01" ...
## $ interval: Factor w/ 288 levels "0","5","10","15",..: 1 2 3 4 5 6 7 8 9 10 ...
Summary
# Per-column summary of the prepared data, including the number of missing
# step values.
summary(data)
## steps date interval
## Min. : 0.0 Min. :2012-10-01 0 : 61
## 1st Qu.: 0.0 1st Qu.:2012-10-16 5 : 61
## Median : 0.0 Median :2012-10-31 10 : 61
## Mean : 37.4 Mean :2012-10-31 15 : 61
## 3rd Qu.: 12.0 3rd Qu.:2012-11-15 20 : 61
## Max. :806.0 Max. :2012-11-30 25 : 61
## NA's :2304 (Other):17202
What is mean total number of steps taken per day?
For this part of the assignment, missing values in the dataset are ignored.
# Total steps for every date; the formula interface omits rows whose step
# count is NA before summing.
steps_taken_per_day <- aggregate(steps ~ date, data = data, FUN = sum)
names(steps_taken_per_day) <- c("date", "steps")

# Distribution of the daily totals, in bins of 1000 steps.
ggplot(steps_taken_per_day, aes(x = steps)) +
  geom_histogram(binwidth = 1000, fill = "darkblue") +
  labs(
    title = "Total Steps Taken Each Day",
    x = "Number of steps taken each Day",
    y = "Number of times (Count)"
  ) +
  theme_bw()

# Mean and median of the daily totals.
mean_steps <- mean(steps_taken_per_day$steps, na.rm = TRUE)
median_steps <- median(steps_taken_per_day$steps, na.rm = TRUE)
mean_steps
## [1] 10766
median_steps
## [1] 10765
What is the average daily activity pattern?
# Mean number of steps for each 5-minute interval, ignoring missing values.
steps_per_interval <- aggregate(
  data$steps,
  by = list(interval = data$interval),
  FUN = mean,
  na.rm = TRUE
)
# Convert the factor labels back to integers for plotting.
steps_per_interval$interval <- as.integer(
  as.character(steps_per_interval$interval)
)
names(steps_per_interval) <- c("interval", "steps")
# Time series generation: mean steps against the 5-minute interval identifier.
ggplot(steps_per_interval, aes(x = interval, y = steps)) +
  geom_line(size = 1, color = "darkblue") +
  labs(
    x = "5-minute Interval",
    y = "Average number of steps taken",
    title = "Average Daily Activity Pattern"
  ) +
  theme_bw() +
  theme(legend.position = "bottom")
# Identifier of the interval whose average step count is the largest.
max_step_interval <- with(steps_per_interval, interval[which.max(steps)])
max_step_interval
## [1] 835
On average, the 5-minute interval with identifier 835 contains the maximum number of steps.
Imputing missing values
Note that there are a number of days/intervals where there are missing values (coded as NA). The presence of missing days may introduce bias into some calculations or summaries of the data.
#' Replace missing step counts with per-interval default values.
#'
#' @param data Data frame with a `steps` column (may contain NA) and an
#'   `interval` column.
#' @param defaults Data frame with an `interval` column and a `steps` column
#'   holding the replacement value for each interval.
#' @return The `steps` vector of `data` with each NA replaced by the default
#'   of the matching interval. An NA is left in place when `defaults` has no
#'   entry for that interval.
fill_na <- function(data, defaults) {
  na_indices <- which(is.na(data$steps))
  # Match on the character form so that a factor `interval` in `data` lines
  # up with an integer `interval` in `defaults`.
  lookup <- match(
    as.character(data$interval[na_indices]),
    as.character(defaults$interval)
  )
  fill_steps <- data$steps
  # Vectorised replacement: a zero-length `na_indices` is a no-op, so data
  # without missing values is returned unchanged instead of raising an error.
  fill_steps[na_indices] <- defaults$steps[lookup]
  fill_steps
}
# Copy of the dataset in which missing step counts are replaced by the mean
# of the corresponding 5-minute interval.
data_fill <- data[, c("steps", "date", "interval")]
data_fill$steps <- fill_na(data, steps_per_interval)
Histogram of the total number of steps taken each day.
# Total steps for every date, computed on the imputed dataset.
full_steps_per_day <- aggregate(steps ~ date, data = data_fill, FUN = sum)
names(full_steps_per_day) <- c("date", "steps")

# Distribution of the daily totals after imputation, in bins of 1000 steps.
ggplot(full_steps_per_day, aes(x = steps)) +
  geom_histogram(binwidth = 1000, fill = "darkblue") +
  labs(
    title = "Histogram of Full Steps Taken per Day",
    x = "Number of Steps after populate missing values",
    y = "Count"
  ) +
  theme_bw()

# Mean and median of the daily totals after imputation.
full_mean_steps <- mean(full_steps_per_day$steps)
full_median_steps <- median(full_steps_per_day$steps)
full_mean_steps
## [1] 10766
full_median_steps
## [1] 10766
Are there differences in activity patterns between weekdays and weekends?
For this part the weekdays() function may be of some help here. Use the dataset with the filled-in missing values for this part.
#' Average the step counts per 5-minute interval.
#'
#' @param data Data frame with a `steps` column and an `interval` column
#'   (factor, numeric or character identifiers).
#' @return Data frame with one row per interval and two columns: `interval`
#'   (integer identifier) and `steps` (mean of `steps`, NA values ignored).
weekdays_steps <- function(data) {
  interval_means <- aggregate(
    data$steps,
    by = list(interval = data$interval),
    FUN = mean,
    na.rm = TRUE
  )
  # Convert to integers for plotting. Going through as.character() handles a
  # factor `interval` and also plain numeric or character identifiers.
  interval_means$interval <- as.integer(as.character(interval_means$interval))
  colnames(interval_means) <- c("interval", "steps")
  interval_means
}
#' Average steps per interval, separately for weekends and weekdays.
#'
#' @param data Data frame with `steps`, `date` (Date) and `interval` columns.
#' @return Data frame with columns `interval`, `steps` and `dayofweek`
#'   (factor with values "weekend" / "weekday"); weekend rows come first.
data_by_weekdays <- function(data) {
  # Use the numeric day of the week (0 = Sunday, 6 = Saturday) rather than
  # the day names returned by weekdays(), which depend on the session locale.
  is_weekend <- as.POSIXlt(data$date)$wday %in% c(0L, 6L)
  weekend_steps <- weekdays_steps(data[is_weekend, , drop = FALSE])
  weekday_steps <- weekdays_steps(data[!is_weekend, , drop = FALSE])
  weekend_steps$dayofweek <- rep("weekend", nrow(weekend_steps))
  weekday_steps$dayofweek <- rep("weekday", nrow(weekday_steps))
  combined <- rbind(weekend_steps, weekday_steps)
  combined$dayofweek <- as.factor(combined$dayofweek)
  combined
}
# Per-interval averages of the imputed data, labelled weekend or weekday.
data_weekdays <- data_by_weekdays(data_fill)
Your plot will look different from the one above because you will be using the activity monitor data. Note that the above plot was made using the lattice system but you can make the same version of the plot using any plotting system you choose.
# One time-series panel per day type, stacked vertically.
ggplot(data_weekdays, aes(x = interval, y = steps)) +
  geom_line(size = 1, color = "red") +
  facet_wrap(~dayofweek, nrow = 2, ncol = 1) +
  labs(x = "Interval", y = "Number of steps") +
  theme_bw()