’’’ This assignment makes use of data from a personal activity monitoring device. This device collects data at 5-minute intervals throughout the day. The data consist of two months of measurements from an anonymous individual, collected during October and November 2012, and include the number of steps taken in each 5-minute interval of each day.
The variables included in this dataset are:
steps: Number of steps taken in a 5-minute interval (missing values are coded as NA)
date: The date on which the measurement was taken in YYYY-MM-DD format
interval: Identifier for the 5-minute interval in which measurement was taken
’’’ Loading required libraries
library(knitr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Question 1: Code for reading in the dataset and/or processing the data
# Load the raw activity log with explicit column types (steps, date,
# interval), then turn the YYYY-MM-DD date strings into Date objects.
data <- read.csv(
  "activity.csv",
  header = TRUE,
  sep = ",",
  colClasses = c("numeric", "character", "integer")
)
data[["date"]] <- ymd(data[["date"]])
summary(data)
## steps date interval
## Min. : 0.00 Min. :2012-10-01 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.:2012-10-16 1st Qu.: 588.8
## Median : 0.00 Median :2012-10-31 Median :1177.5
## Mean : 37.38 Mean :2012-10-31 Mean :1177.5
## 3rd Qu.: 12.00 3rd Qu.:2012-11-15 3rd Qu.:1766.2
## Max. :806.00 Max. :2012-11-30 Max. :2355.0
## NA's :2304
Question 2: What is the mean total number of steps taken per day?
# Daily step totals. Rows with a missing step count are removed before
# grouping, so each total is built from recorded values only.
steps <- data %>%
  filter(!is.na(steps)) %>%
  group_by(date) %>%
  summarise(steps = sum(steps))
print(steps)
## # A tibble: 53 x 2
## date steps
## <date> <dbl>
## 1 2012-10-02 126
## 2 2012-10-03 11352
## 3 2012-10-04 12116
## 4 2012-10-05 13294
## 5 2012-10-06 15420
## 6 2012-10-07 11015
## 7 2012-10-09 12811
## 8 2012-10-10 9900
## 9 2012-10-11 10304
## 10 2012-10-12 17382
## # ... with 43 more rows
Question 3: Histogram of the total number of steps taken each day
# Distribution of the daily totals, using bins 1000 steps wide.
ggplot(data = steps, mapping = aes(x = steps)) +
  geom_histogram(binwidth = 1000, fill = "coral") +
  labs(
    title = "Total Steps per day",
    x = "Steps per day",
    y = "Frequency"
  )
Question 4: Mean and median number of steps taken each day
# Mean of the daily totals; stored so it can be reused, then printed.
mean_steps <- mean(steps[["steps"]], na.rm = TRUE)
print(mean_steps)
## [1] 10766.19
# Median of the daily totals; stored so it can be reused, then printed.
median_steps <- median(steps[["steps"]], na.rm = TRUE)
print(median_steps)
## [1] 10765
Question 5: Time series plot of the average number of steps taken
# Average number of steps in each 5-minute interval, taken across all days.
# Rows with a missing step count are removed before averaging.
interval <- data %>%
  filter(!is.na(steps)) %>%
  group_by(interval) %>%
  summarise(steps = mean(steps))
interval
## # A tibble: 288 x 2
## interval steps
## <int> <dbl>
## 1 0 1.72
## 2 5 0.340
## 3 10 0.132
## 4 15 0.151
## 5 20 0.0755
## 6 25 2.09
## 7 30 0.528
## 8 35 0.868
## 9 40 0
## 10 45 1.47
## # ... with 278 more rows
# Time series of the per-interval averages.
ggplot(data = interval, mapping = aes(x = interval, y = steps)) +
  geom_line(colour = "darkmagenta")
Question 6: The 5-minute interval that, on average, contains the maximum number of steps
# Row of the interval whose average step count is largest (first one on ties).
slice(interval, which.max(steps))
## # A tibble: 1 x 2
## interval steps
## <int> <dbl>
## 1 835 206.
Question 7: Code to describe and show a strategy for imputing missing data
# Imputation strategy: each missing step count is replaced by the mean of
# the recorded values for the same 5-minute interval.
data_full <- data
nas <- is.na(data_full[["steps"]])

# Named vector of per-interval means; the names are the interval identifiers.
avg_interval <- tapply(
  X = data_full[["steps"]],
  INDEX = data_full[["interval"]],
  FUN = mean,
  na.rm = TRUE,
  simplify = TRUE
)

# Look up each missing row's interval by name and fill in its mean.
data_full[["steps"]][nas] <-
  avg_interval[as.character(data_full[["interval"]][nas])]

# Daily totals recomputed on the imputed data.
steps_full <- data_full %>%
  filter(!is.na(steps)) %>%
  group_by(date) %>%
  summarise(steps = sum(steps))
steps_full
## # A tibble: 61 x 2
## date steps
## <date> <dbl>
## 1 2012-10-01 10766.
## 2 2012-10-02 126
## 3 2012-10-03 11352
## 4 2012-10-04 12116
## 5 2012-10-05 13294
## 6 2012-10-06 15420
## 7 2012-10-07 11015
## 8 2012-10-08 10766.
## 9 2012-10-09 12811
## 10 2012-10-10 9900
## # ... with 51 more rows
Question 8: Histogram of the total number of steps taken each day after missing values are imputed
# Distribution of the daily totals after imputation, in bins 1000 steps wide.
ggplot(data = steps_full, mapping = aes(x = steps)) +
  geom_histogram(binwidth = 1000, fill = "cornflowerblue") +
  labs(
    title = "Total number of steps taken each day after missing values are imputed",
    x = "Steps per day",
    y = "Frequency"
  )
Question 9: Panel plot comparing the average number of steps taken per 5-minute interval across weekdays and weekends
# Mean and median of the daily totals computed from the imputed data.
# Both are stored without being printed.
mean_steps_full <- mean(steps_full[["steps"]], na.rm = TRUE)
median_steps_full <- median(steps_full[["steps"]], na.rm = TRUE)
# Label every observation as "weekday" or "weekend".
# The day of the week is taken as a number (Monday = 1 ... Sunday = 7) rather
# than compared against English day names: weekdays() returns names in the
# current locale, so a name comparison would label every day "weekday" on a
# non-English system.
data_full <- data_full %>%
  mutate(
    weektype = factor(
      ifelse(wday(date, week_start = 1) >= 6, "weekend", "weekday")
    )
  )
head(data_full)
## steps date interval weektype
## 1 1.7169811 2012-10-01 0 weekday
## 2 0.3396226 2012-10-01 5 weekday
## 3 0.1320755 2012-10-01 10 weekday
## 4 0.1509434 2012-10-01 15 weekday
## 5 0.0754717 2012-10-01 20 weekday
## 6 2.0943396 2012-10-01 25 weekday
# Average steps per 5-minute interval, computed separately for weekdays and
# weekends on the imputed data.
# `.groups = "drop"` returns an ungrouped result, so later operations on
# interval_full are not silently applied per interval, and summarise() no
# longer emits its "has grouped output" message.
interval_full <- data_full %>%
  group_by(interval, weektype) %>%
  summarise(steps = mean(steps), .groups = "drop")
# Panel plot: one stacked panel per week type, showing the average steps
# across the day.
ggplot(
  data = interval_full,
  mapping = aes(x = interval, y = steps, colour = weektype)
) +
  geom_line() +
  facet_wrap(~weektype, nrow = 2, ncol = 1)
All of the R code needed to reproduce the results (numbers, plots, etc.) in the report is present