# set defaults: cache chunks
knitr::opts_chunk$set(cache=TRUE)

Loading and preprocessing the data

The data was obtained by forking the project git repository and unzipping the file ‘activity.zip’ to ./data/raw and read into dataframe variable df.

unzip('activity.zip', exdir = "./data/raw")
df <- read.csv('./data/raw/activity.csv')

Load the necessary libraries for this report:

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(numform)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
# set ggplot title default to centre justified
theme_update(plot.title = element_text(hjust = 0.5))

library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(moments)
library(cowplot)
## 
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
## 
##     stamp

The interval column is converted into HH:mm format while the date column is converted into date type:

df <- df %>% 
  mutate(interval = sapply(interval, function(xtime) {
    xtime <- f_pad_zero(xtime, 4)
    xtime <- paste0(substr(xtime, 1, 2), ':', substr(xtime, 3, 4))
  })) %>%
  mutate(date = as_date(date))

head(df)
##   steps       date interval
## 1    NA 2012-10-01    00:00
## 2    NA 2012-10-01    00:05
## 3    NA 2012-10-01    00:10
## 4    NA 2012-10-01    00:15
## 5    NA 2012-10-01    00:20
## 6    NA 2012-10-01    00:25

What is mean total number of steps taken per day?

Calculate and report the mean, median and total step taken each day

steps_by_day <- df %>% 
  group_by(date) %>% 
  summarise(
    Mean=round(mean(steps, na.rm=T),2), 
    Median=round(median(steps, na.rm=T),2),
    Total=sum(steps)
    ) %>%
  mutate(Mean = ifelse(is.nan(Mean), NA, Mean))

steps_by_day %>% 
  kbl() %>% 
  kable_styling(
    bootstrap_options = "striped", 
    full_width = F
    )
date Mean Median Total
2012-10-01 NA NA NA
2012-10-02 0.44 0 126
2012-10-03 39.42 0 11352
2012-10-04 42.07 0 12116
2012-10-05 46.16 0 13294
2012-10-06 53.54 0 15420
2012-10-07 38.25 0 11015
2012-10-08 NA NA NA
2012-10-09 44.48 0 12811
2012-10-10 34.38 0 9900
2012-10-11 35.78 0 10304
2012-10-12 60.35 0 17382
2012-10-13 43.15 0 12426
2012-10-14 52.42 0 15098
2012-10-15 35.20 0 10139
2012-10-16 52.38 0 15084
2012-10-17 46.71 0 13452
2012-10-18 34.92 0 10056
2012-10-19 41.07 0 11829
2012-10-20 36.09 0 10395
2012-10-21 30.63 0 8821
2012-10-22 46.74 0 13460
2012-10-23 30.97 0 8918
2012-10-24 29.01 0 8355
2012-10-25 8.65 0 2492
2012-10-26 23.53 0 6778
2012-10-27 35.14 0 10119
2012-10-28 39.78 0 11458
2012-10-29 17.42 0 5018
2012-10-30 34.09 0 9819
2012-10-31 53.52 0 15414
2012-11-01 NA NA NA
2012-11-02 36.81 0 10600
2012-11-03 36.70 0 10571
2012-11-04 NA NA NA
2012-11-05 36.25 0 10439
2012-11-06 28.94 0 8334
2012-11-07 44.73 0 12883
2012-11-08 11.18 0 3219
2012-11-09 NA NA NA
2012-11-10 NA NA NA
2012-11-11 43.78 0 12608
2012-11-12 37.38 0 10765
2012-11-13 25.47 0 7336
2012-11-14 NA NA NA
2012-11-15 0.14 0 41
2012-11-16 18.89 0 5441
2012-11-17 49.79 0 14339
2012-11-18 52.47 0 15110
2012-11-19 30.70 0 8841
2012-11-20 15.53 0 4472
2012-11-21 44.40 0 12787
2012-11-22 70.93 0 20427
2012-11-23 73.59 0 21194
2012-11-24 50.27 0 14478
2012-11-25 41.09 0 11834
2012-11-26 38.76 0 11162
2012-11-27 47.38 0 13646
2012-11-28 35.36 0 10183
2012-11-29 24.47 0 7047
2012-11-30 NA NA NA

Make a histogram of the total number of steps taken each day:

hist_withNA <- ggplot(steps_by_day, aes(Total)) + 
  geom_histogram(bins=8, na.rm=T) +
  ggtitle("Histogram of Total Steps Each Day") +
  xlab("Daily Step Count") +
  ylab("Frequency")
hist_withNA

What is the average daily activity pattern?

Make a time series plot of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis)

steps_by_interval <- df %>% 
  group_by(interval) %>% 
  summarise(
    Mean=round(mean(steps, na.rm=T),2)
  )

every_nth = function(n) {
  return(function(x) {x[c(TRUE, rep(FALSE, n - 1))]})
}

ggplot(steps_by_interval, aes(y=Mean, x=interval, group=1)) + 
  geom_line() +
  ggtitle("Mean Steps by Time Interval") +
  ylab("Mean Steps per Time Interval") +
  theme(axis.text.x = element_text(angle=45, hjust = 1)) +
  scale_x_discrete(breaks = every_nth(n = 12))

Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?

From the data, the peak mean step count occurs in the 08:35 time interval, part of a sustained peak between 08:00 and 09:20.

There are smaller peaks at around 12:00, 16:00 and 18:45.

Imputing missing values

Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs)

na_count <- sum(is.na(df$steps))
na_count
## [1] 2304

There are 2304 missing values.

Devise a strategy for filling in all of the missing values in the dataset. The strategy does not need to be sophisticated. For example, you could use the mean/median for that day, or the mean for that 5-minute interval, etc.

The step count is most likely influenced by both time of day and day of the week (weekday or weekend).

Solution is to calculate the mean step count of the time interval and week day type that each NA value falls on.

First create a dataframe with those values:

df_day_type <- df
df_day_type$day_type <- factor(ifelse(wday(df_day_type$date)<6, "Weekday", "Weekend"))
mean_by_daytype <- df_day_type %>%
  group_by(day_type, tod=interval) %>%
  summarise(Mean = mean(steps, na.rm = T))
## `summarise()` has grouped output by 'day_type'. You can override using the `.groups` argument.

Then create a function that takes an observation (row) and determines if the step count is NA. If it is, the mean_by_daytype dataframe is used to calculate the fill-in value, otherwise the original step count is returned.

get_step_estimate <- function(observation, dmbw) {
  if (is.na(observation['steps'])){
    estimate <- dmbw %>% 
      subset(day_type==observation['day_type'] & tod==observation['interval'])
    estimate$Mean
  } else {
    observation['steps']
  }
}

Create a new dataset that is equal to the original dataset but with the missing data filled in.

Tying these two together using apply, we create a new dataframe df_noNA with the filled-in values:

df_noNA <- df_day_type %>%
  mutate(
    steps = as.numeric(apply(
      df_day_type, 
      1, 
      get_step_estimate, 
      dmbw=mean_by_daytype
    ))
  ) %>%
  select(-day_type)
head(df_noNA)
##       steps       date interval
## 1 2.3333333 2012-10-01    00:00
## 2 0.4615385 2012-10-01    00:05
## 3 0.1794872 2012-10-01    00:10
## 4 0.2051282 2012-10-01    00:15
## 5 0.1025641 2012-10-01    00:20
## 6 2.8461538 2012-10-01    00:25

Make a histogram of the total number of steps taken each day and Calculate and report the mean and median total number of steps taken per day.

A repeat of the calculation of steps by day needs to be performed on the dataset with calculated values.

steps_by_day_noNA <- df_noNA %>% 
  group_by(date) %>% 
  summarise(
    Mean=round(mean(steps, na.rm=T),2), 
    Median=round(median(steps, na.rm=T),2),
    Total=round(sum(steps))
  )

Show the original histogram alongside the histogram of the new calculated dataset for comparison:

# Create the histogram with calculated values
hist_noNA <- ggplot(steps_by_day_noNA, aes(Total)) + 
  geom_histogram(bins=8, na.rm=T) +
  ggtitle("Histogram of Total Steps Each Day") +
  xlab("Daily Step Count") +
  ylab("Frequency")

# Find the maximum frequency of both histograms, use the larger as the y axis limit
# Do this to make a like-for-like sisde by side comparison
with_NA_max_f <- max(ggplot_build(hist_withNA)$data[[1]]$ymax)
no_NA_max_f <- max(ggplot_build(hist_noNA)$data[[1]]$ymax)
y_lim <- max(c(with_NA_max_f, no_NA_max_f))

# Create a plot grid, use as a nested grid under a common title
hists <- plot_grid(
  hist_withNA + ylim(0, y_lim+1) + ggtitle("With Missing Values"), 
  hist_noNA + ylim(0, y_lim+1) + ggtitle("With Calculated Values")
  )

# Create the common title
title <- ggdraw() + 
  draw_label(
    "Histogram of Total Steps by Day",
    fontface = 'bold',
    x = 0,
    hjust = -0.5
  ) +
  theme(
    plot.margin = margin(0, 0, 0, 0)
  )

# Show the histograms under common title
plot_grid(
  title, hists,
  ncol = 1,
  # rel_heights values control vertical title margins
  rel_heights = c(0.1, 1)
)

steps_by_day_noNA %>% 
  kbl() %>% 
  kable_styling(
    bootstrap_options = "striped", 
    full_width = F
    )
date Mean Median Total
2012-10-01 35.29 26.35 10163
2012-10-02 0.44 0.00 126
2012-10-03 39.42 0.00 11352
2012-10-04 42.07 0.00 12116
2012-10-05 46.16 0.00 13294
2012-10-06 53.54 0.00 15420
2012-10-07 38.25 0.00 11015
2012-10-08 35.29 26.35 10163
2012-10-09 44.48 0.00 12811
2012-10-10 34.38 0.00 9900
2012-10-11 35.78 0.00 10304
2012-10-12 60.35 0.00 17382
2012-10-13 43.15 0.00 12426
2012-10-14 52.42 0.00 15098
2012-10-15 35.20 0.00 10139
2012-10-16 52.38 0.00 15084
2012-10-17 46.71 0.00 13452
2012-10-18 34.92 0.00 10056
2012-10-19 41.07 0.00 11829
2012-10-20 36.09 0.00 10395
2012-10-21 30.63 0.00 8821
2012-10-22 46.74 0.00 13460
2012-10-23 30.97 0.00 8918
2012-10-24 29.01 0.00 8355
2012-10-25 8.65 0.00 2492
2012-10-26 23.53 0.00 6778
2012-10-27 35.14 0.00 10119
2012-10-28 39.78 0.00 11458
2012-10-29 17.42 0.00 5018
2012-10-30 34.09 0.00 9819
2012-10-31 53.52 0.00 15414
2012-11-01 35.29 26.35 10163
2012-11-02 36.81 0.00 10600
2012-11-03 36.70 0.00 10571
2012-11-04 35.29 26.35 10163
2012-11-05 36.25 0.00 10439
2012-11-06 28.94 0.00 8334
2012-11-07 44.73 0.00 12883
2012-11-08 11.18 0.00 3219
2012-11-09 43.22 22.29 12448
2012-11-10 43.22 22.29 12448
2012-11-11 43.78 0.00 12608
2012-11-12 37.38 0.00 10765
2012-11-13 25.47 0.00 7336
2012-11-14 35.29 26.35 10163
2012-11-15 0.14 0.00 41
2012-11-16 18.89 0.00 5441
2012-11-17 49.79 0.00 14339
2012-11-18 52.47 0.00 15110
2012-11-19 30.70 0.00 8841
2012-11-20 15.53 0.00 4472
2012-11-21 44.40 0.00 12787
2012-11-22 70.93 0.00 20427
2012-11-23 73.59 0.00 21194
2012-11-24 50.27 0.00 14478
2012-11-25 41.09 0.00 11834
2012-11-26 38.76 0.00 11162
2012-11-27 47.38 0.00 13646
2012-11-28 35.36 0.00 10183
2012-11-29 24.47 0.00 7047
2012-11-30 43.22 22.29 12448

Do these values differ from the estimates from the first part of the assignment?

The most common daily counts (i.e. the modal values) appear to have been disproportionally stretched, while the tails appear unchanged. Since we are adding average values, this is expected.

The skewness of the data increases for the dataset with calculated values indicating the mean and median have been pulled to the right.This is in keeping with the above observation as the most common values are higher than the mean and median.

skewness(df$steps, na.rm=T)
## [1] 4.171477
skewness(df_noNA$steps)
## [1] 4.337421

What is the impact of imputing missing data on the estimates of the total daily number of steps?

Total number of steps have expectedly risen given that most NA values were replaced with values greater than zero.

total_with_na <- sum(df$steps, na.rm = T)
total_with_na
## [1] 570608
total_no_na <- as.integer(sum(df_noNA$steps))
total_no_na
## [1] 658763

Total steps with NAs: 570608

Total steps with calculated values: 658763

Increase: 88155

Looking at the increase per day, it appears to have only affected the overall count for days where all values were NA. We can infer (presuming the calculations are correct) that any NA values with partial step counts occurred when the mean step count for that time/day of week were also 0.

raw_step_count <- steps_by_day$Total
# NA values count zero towards the sum
raw_step_count[is.na(raw_step_count)] <- 0
daily_diff <- steps_by_day_noNA$Total - raw_step_count
daily_diff
##  [1] 10163     0     0     0     0     0     0 10163     0     0     0     0
## [13]     0     0     0     0     0     0     0     0     0     0     0     0
## [25]     0     0     0     0     0     0     0 10163     0     0 10163     0
## [37]     0     0     0 12448 12448     0     0     0 10163     0     0     0
## [49]     0     0     0     0     0     0     0     0     0     0     0     0
## [61] 12448

Are there differences in activity patterns between weekdays and weekends?

Create a new factor variable in the dataset with two levels – “weekday” and “weekend” indicating whether a given date is a weekday or weekend day.

df$day_type <- factor(ifelse(wday(df$date)<6, "Weekday", "Weekend"))
head(df)
##   steps       date interval day_type
## 1    NA 2012-10-01    00:00  Weekday
## 2    NA 2012-10-01    00:05  Weekday
## 3    NA 2012-10-01    00:10  Weekday
## 4    NA 2012-10-01    00:15  Weekday
## 5    NA 2012-10-01    00:20  Weekday
## 6    NA 2012-10-01    00:25  Weekday

Make a panel plot containing a time series plot of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis).

steps_by_interval <- df %>% 
  group_by(day_type, interval) %>% 
  summarise(
    Mean=round(mean(steps, na.rm=T),2)
  )
## `summarise()` has grouped output by 'day_type'. You can override using the `.groups` argument.
ggplot(steps_by_interval, aes(y=Mean, x=interval, group=1)) + 
  geom_line() +
  ggtitle("Mean Steps by Time Interval") +
  ylab("Mean Steps per Time Interval") +
  theme(axis.text.x = element_text(angle=45, hjust = 1)) +
  scale_x_discrete(breaks = every_nth(n = 12)) +
  xlab("Time of Day") +
  facet_wrap(~ day_type, ncol = 1)

The initial comparison shows that weekdays have more activity in the early part of the day (6am - 8am), however the weekends show a bigger peak between 8am and 10am and more sustained activity during the afternoons and early evening.

By looking at the mean total steps each day by weekend or weekday, we can see weekend days are on average approximately 20% more active than weekdays.

df %>% 
  group_by(day_type) %>% 
  summarise(
    Mean=round(mean(steps, na.rm=T),2)
  )
## # A tibble: 2 x 2
##   day_type  Mean
##   <fct>    <dbl>
## 1 Weekday   35.3
## 2 Weekend   43.2