Reproducible Research Assignment 1

This assignment makes use of data from a personal activity monitoring device. This device collects data at 5-minute intervals throughout the day. The data consists of two months of data from an anonymous individual, collected during October and November 2012, and includes the number of steps taken in 5-minute intervals each day.

The variables included in this dataset are:

steps: Number of steps taken in a 5-minute interval (missing values are coded as NA)
date: The date on which the measurement was taken in YYYY-MM-DD format
interval: Identifier for the 5-minute interval in which measurement was taken

Loading required libraries

library(knitr)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

Question 1: Code for reading in the dataset and/or processing the data

# Read the raw activity log. The date column is read as plain text so that it
# can be parsed explicitly into Date values on the next line.
data <- read.csv(
  "activity.csv",
  header = TRUE,
  sep = ",",
  colClasses = c("numeric", "character", "integer")
)
data$date <- ymd(data$date)
summary(data)

##      steps             date               interval     
##  Min.   :  0.00   Min.   :2012-10-01   Min.   :   0.0  
##  1st Qu.:  0.00   1st Qu.:2012-10-16   1st Qu.: 588.8  
##  Median :  0.00   Median :2012-10-31   Median :1177.5  
##  Mean   : 37.38   Mean   :2012-10-31   Mean   :1177.5  
##  3rd Qu.: 12.00   3rd Qu.:2012-11-15   3rd Qu.:1766.2  
##  Max.   :806.00   Max.   :2012-11-30   Max.   :2355.0  
##  NA's   :2304

Question 2: What is the mean total number of steps taken per day?

# Total steps per day. Rows with a missing step count are dropped before
# summing, so a day whose readings are all NA does not appear in the result.
steps <- data[!is.na(data$steps), ] %>%
  group_by(date) %>%
  summarise(steps = sum(steps))

print(steps)

## # A tibble: 53 x 2
##    date       steps
##    <date>     <dbl>
##  1 2012-10-02   126
##  2 2012-10-03 11352
##  3 2012-10-04 12116
##  4 2012-10-05 13294
##  5 2012-10-06 15420
##  6 2012-10-07 11015
##  7 2012-10-09 12811
##  8 2012-10-10  9900
##  9 2012-10-11 10304
## 10 2012-10-12 17382
## # ... with 43 more rows

Question 3: Histogram of the total number of steps taken each day

# Distribution of the daily step totals, in bins of 1000 steps.
ggplot(data = steps, mapping = aes(x = steps)) +
  geom_histogram(binwidth = 1000, fill = "coral") +
  labs(
    title = "Total Steps per day",
    x = "Steps per day",
    y = "Frequency"
  )

Question 4: Mean and median number of steps taken each day

# Mean of the daily step totals.
mean_steps <- mean(steps$steps, na.rm = TRUE)
print(mean_steps)

## [1] 10766.19

# Median of the daily step totals.
median_steps <- median(steps$steps, na.rm = TRUE)
print(median_steps)

## [1] 10765

Question 5: Time series plot of the average number of steps taken

# Average step count for each 5-minute interval across all days, computed
# only from the rows where a step count was actually recorded.
interval <- data[!is.na(data$steps), ] %>%
  group_by(interval) %>%
  summarise(steps = mean(steps))

interval

## # A tibble: 288 x 2
##    interval  steps
##       <int>  <dbl>
##  1        0 1.72  
##  2        5 0.340 
##  3       10 0.132 
##  4       15 0.151 
##  5       20 0.0755
##  6       25 2.09  
##  7       30 0.528 
##  8       35 0.868 
##  9       40 0     
## 10       45 1.47  
## # ... with 278 more rows

# Time series of the average steps per 5-minute interval.
ggplot(data = interval, mapping = aes(x = interval, y = steps)) +
  geom_line(colour = "darkmagenta")

Question 6: The 5-minute interval that, on average, contains the maximum number of steps

# Row of the interval whose average step count is the largest
# (which.max() returns the first such row if there are ties).
interval[which.max(interval[["steps"]]), ]

## # A tibble: 1 x 2
##   interval steps
##      <int> <dbl>
## 1      835  206.

Question 7: Code to describe and show a strategy for imputing missing data

# Imputation strategy: replace every missing step count with the mean of its
# 5-minute interval, taken over the days on which that interval was recorded.
data_full <- data
nas <- is.na(data_full$steps)

avg_interval <- tapply(
  X = data_full$steps,
  INDEX = data_full$interval,
  FUN = mean,
  na.rm = TRUE,
  simplify = TRUE
)

# tapply() names its result by interval value, so each missing row is filled
# by looking up its interval as a character name.
data_full$steps[nas] <- avg_interval[as.character(data_full$interval[nas])]

# Daily totals recomputed on the imputed data.
steps_full <- data_full[!is.na(data_full$steps), ] %>%
  group_by(date) %>%
  summarise(steps = sum(steps))

steps_full

## # A tibble: 61 x 2
##    date        steps
##    <date>      <dbl>
##  1 2012-10-01 10766.
##  2 2012-10-02   126 
##  3 2012-10-03 11352 
##  4 2012-10-04 12116 
##  5 2012-10-05 13294 
##  6 2012-10-06 15420 
##  7 2012-10-07 11015 
##  8 2012-10-08 10766.
##  9 2012-10-09 12811 
## 10 2012-10-10  9900 
## # ... with 51 more rows

Question 8: Histogram of the total number of steps taken each day after missing values are imputed

# Distribution of the daily step totals after imputation, in bins of 1000.
ggplot(data = steps_full, mapping = aes(x = steps)) +
  geom_histogram(binwidth = 1000, fill = "cornflowerblue") +
  labs(
    title = "Total number of steps taken each day after missing values are imputed",
    x = "Steps per day",
    y = "Frequency"
  )

Question 9: Panel plot comparing the average number of steps taken per 5-minute interval across weekdays and weekends

# Mean and median of the daily totals computed from the imputed data.
# Both values are stored here without being printed.
mean_steps_full <- mean(steps_full[["steps"]], na.rm = TRUE)
median_steps_full <- median(steps_full[["steps"]], na.rm = TRUE)

# Label every observation as falling on a "weekday" or a "weekend" day.
#
# weekdays() returns day names in the session's locale, so comparing its
# result with "Saturday"/"Sunday" would silently classify every day as
# "weekday" on a non-English system. lubridate::wday() with week_start = 1
# numbers the days Monday = 1 ... Sunday = 7 independently of locale, which
# makes Saturday and Sunday exactly the values >= 6.
data_full <- mutate(
  data_full,
  weektype = factor(
    ifelse(wday(date, week_start = 1) >= 6, "weekend", "weekday"),
    # Fixed level set: same order as.factor() would give, and both levels
    # exist even if the data happened to contain only one kind of day.
    levels = c("weekday", "weekend")
  )
)

head(data_full)

##       steps       date interval weektype
## 1 1.7169811 2012-10-01        0  weekday
## 2 0.3396226 2012-10-01        5  weekday
## 3 0.1320755 2012-10-01       10  weekday
## 4 0.1509434 2012-10-01       15  weekday
## 5 0.0754717 2012-10-01       20  weekday
## 6 2.0943396 2012-10-01       25  weekday

# Average steps per 5-minute interval, computed separately for weekdays and
# weekends on the imputed data.
interval_full <- data_full %>%
  group_by(interval, weektype) %>%
  summarise(
    steps = mean(steps)
  )

## `summarise()` has grouped output by 'interval'. You can override using the `.groups` argument.

# Two stacked panels (one per day type) of average steps by interval.
ggplot(
  data = interval_full,
  mapping = aes(x = interval, y = steps, colour = weektype)
) +
  geom_line() +
  facet_wrap(~ weektype, nrow = 2, ncol = 1)

All of the R code needed to reproduce the results (numbers, plots, etc.) in the report is present

Reproducible Research Assignment 1

Anoop

18/07/2021