NA
s)# set defaults: cache chunks
knitr::opts_chunk$set(cache=TRUE)
The data was obtained by forking the project git
repository and unzipping the file ‘activity.zip’ to ./data/raw and
read into dataframe variable df
.
unzip('activity.zip', exdir = "./data/raw")
df <- read.csv('./data/raw/activity.csv')
Load the necessary libraries for this report:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(numform)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
# set ggplot title default to centre justified
theme_update(plot.title = element_text(hjust = 0.5))
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(moments)
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:lubridate':
##
## stamp
The interval column is converted into HH:mm format while the date column is converted into date type:
df <- df %>%
mutate(interval = sapply(interval, function(xtime) {
xtime <- f_pad_zero(xtime, 4)
xtime <- paste0(substr(xtime, 1, 2), ':', substr(xtime, 3, 4))
})) %>%
mutate(date = as_date(date))
head(df)
## steps date interval
## 1 NA 2012-10-01 00:00
## 2 NA 2012-10-01 00:05
## 3 NA 2012-10-01 00:10
## 4 NA 2012-10-01 00:15
## 5 NA 2012-10-01 00:20
## 6 NA 2012-10-01 00:25
steps_by_day <- df %>%
group_by(date) %>%
summarise(
Mean=round(mean(steps, na.rm=T),2),
Median=round(median(steps, na.rm=T),2),
Total=sum(steps)
) %>%
mutate(Mean = ifelse(is.nan(Mean), NA, Mean))
steps_by_day %>%
kbl() %>%
kable_styling(
bootstrap_options = "striped",
full_width = F
)
date | Mean | Median | Total |
---|---|---|---|
2012-10-01 | NA | NA | NA |
2012-10-02 | 0.44 | 0 | 126 |
2012-10-03 | 39.42 | 0 | 11352 |
2012-10-04 | 42.07 | 0 | 12116 |
2012-10-05 | 46.16 | 0 | 13294 |
2012-10-06 | 53.54 | 0 | 15420 |
2012-10-07 | 38.25 | 0 | 11015 |
2012-10-08 | NA | NA | NA |
2012-10-09 | 44.48 | 0 | 12811 |
2012-10-10 | 34.38 | 0 | 9900 |
2012-10-11 | 35.78 | 0 | 10304 |
2012-10-12 | 60.35 | 0 | 17382 |
2012-10-13 | 43.15 | 0 | 12426 |
2012-10-14 | 52.42 | 0 | 15098 |
2012-10-15 | 35.20 | 0 | 10139 |
2012-10-16 | 52.38 | 0 | 15084 |
2012-10-17 | 46.71 | 0 | 13452 |
2012-10-18 | 34.92 | 0 | 10056 |
2012-10-19 | 41.07 | 0 | 11829 |
2012-10-20 | 36.09 | 0 | 10395 |
2012-10-21 | 30.63 | 0 | 8821 |
2012-10-22 | 46.74 | 0 | 13460 |
2012-10-23 | 30.97 | 0 | 8918 |
2012-10-24 | 29.01 | 0 | 8355 |
2012-10-25 | 8.65 | 0 | 2492 |
2012-10-26 | 23.53 | 0 | 6778 |
2012-10-27 | 35.14 | 0 | 10119 |
2012-10-28 | 39.78 | 0 | 11458 |
2012-10-29 | 17.42 | 0 | 5018 |
2012-10-30 | 34.09 | 0 | 9819 |
2012-10-31 | 53.52 | 0 | 15414 |
2012-11-01 | NA | NA | NA |
2012-11-02 | 36.81 | 0 | 10600 |
2012-11-03 | 36.70 | 0 | 10571 |
2012-11-04 | NA | NA | NA |
2012-11-05 | 36.25 | 0 | 10439 |
2012-11-06 | 28.94 | 0 | 8334 |
2012-11-07 | 44.73 | 0 | 12883 |
2012-11-08 | 11.18 | 0 | 3219 |
2012-11-09 | NA | NA | NA |
2012-11-10 | NA | NA | NA |
2012-11-11 | 43.78 | 0 | 12608 |
2012-11-12 | 37.38 | 0 | 10765 |
2012-11-13 | 25.47 | 0 | 7336 |
2012-11-14 | NA | NA | NA |
2012-11-15 | 0.14 | 0 | 41 |
2012-11-16 | 18.89 | 0 | 5441 |
2012-11-17 | 49.79 | 0 | 14339 |
2012-11-18 | 52.47 | 0 | 15110 |
2012-11-19 | 30.70 | 0 | 8841 |
2012-11-20 | 15.53 | 0 | 4472 |
2012-11-21 | 44.40 | 0 | 12787 |
2012-11-22 | 70.93 | 0 | 20427 |
2012-11-23 | 73.59 | 0 | 21194 |
2012-11-24 | 50.27 | 0 | 14478 |
2012-11-25 | 41.09 | 0 | 11834 |
2012-11-26 | 38.76 | 0 | 11162 |
2012-11-27 | 47.38 | 0 | 13646 |
2012-11-28 | 35.36 | 0 | 10183 |
2012-11-29 | 24.47 | 0 | 7047 |
2012-11-30 | NA | NA | NA |
hist_withNA <- ggplot(steps_by_day, aes(Total)) +
geom_histogram(bins=8, na.rm=T) +
ggtitle("Histogram of Total Steps Each Day") +
xlab("Daily Step Count") +
ylab("Frequency")
hist_withNA
steps_by_interval <- df %>%
group_by(interval) %>%
summarise(
Mean=round(mean(steps, na.rm=T),2)
)
every_nth = function(n) {
return(function(x) {x[c(TRUE, rep(FALSE, n - 1))]})
}
ggplot(steps_by_interval, aes(y=Mean, x=interval, group=1)) +
geom_line() +
ggtitle("Mean Steps by Time Interval") +
ylab("Mean Steps per Time Interval") +
theme(axis.text.x = element_text(angle=45, hjust = 1)) +
scale_x_discrete(breaks = every_nth(n = 12))
From the data, the peak mean step count occurs in the 08:35 time interval, part of a sustained peak between 08:00 and 09:20.
There are smaller peaks at around 12:00, 16:00 and 18:45.
NA
s)na_count <- sum(is.na(df$steps))
na_count
## [1] 2304
There are 2304 missing values.
The step count is most likely influenced by both time of day and day of the week (weekday or weekend).
Solution is to calculate the mean step count of the time interval and week day type that each NA value falls on.
First create a dataframe with those values:
df_day_type <- df
df_day_type$day_type <- factor(ifelse(wday(df_day_type$date)<6, "Weekday", "Weekend"))
mean_by_daytype <- df_day_type %>%
group_by(day_type, tod=interval) %>%
summarise(Mean = mean(steps, na.rm = T))
## `summarise()` has grouped output by 'day_type'. You can override using the `.groups` argument.
Then create a function that takes an observation (row) and determines
if the step count is NA. If it is, the mean_by_daytype
dataframe is used to calculate the fill-in value, otherwise the original
step count is returned.
get_step_estimate <- function(observation, dmbw) {
if (is.na(observation['steps'])){
estimate <- dmbw %>%
subset(day_type==observation['day_type'] & tod==observation['interval'])
estimate$Mean
} else {
observation['steps']
}
}
Tying these two together using apply
, we create a new
dataframe df_noNA
with the filled-in values:
df_noNA <- df_day_type %>%
mutate(
steps = as.numeric(apply(
df_day_type,
1,
get_step_estimate,
dmbw=mean_by_daytype
))
) %>%
select(-day_type)
head(df_noNA)
## steps date interval
## 1 2.3333333 2012-10-01 00:00
## 2 0.4615385 2012-10-01 00:05
## 3 0.1794872 2012-10-01 00:10
## 4 0.2051282 2012-10-01 00:15
## 5 0.1025641 2012-10-01 00:20
## 6 2.8461538 2012-10-01 00:25
A repeat of the calculation of steps by day needs to be performed on the dataset with calculated values.
steps_by_day_noNA <- df_noNA %>%
group_by(date) %>%
summarise(
Mean=round(mean(steps, na.rm=T),2),
Median=round(median(steps, na.rm=T),2),
Total=round(sum(steps))
)
Show the original histogram alongside the histogram of the new calculated dataset for comparison:
# Create the histogram with calculated values
hist_noNA <- ggplot(steps_by_day_noNA, aes(Total)) +
geom_histogram(bins=8, na.rm=T) +
ggtitle("Histogram of Total Steps Each Day") +
xlab("Daily Step Count") +
ylab("Frequency")
# Find the maximum frequency of both histograms, use the larger as the y axis limit
# Do this to make a like-for-like sisde by side comparison
with_NA_max_f <- max(ggplot_build(hist_withNA)$data[[1]]$ymax)
no_NA_max_f <- max(ggplot_build(hist_noNA)$data[[1]]$ymax)
y_lim <- max(c(with_NA_max_f, no_NA_max_f))
# Create a plot grid, use as a nested grid under a common title
hists <- plot_grid(
hist_withNA + ylim(0, y_lim+1) + ggtitle("With Missing Values"),
hist_noNA + ylim(0, y_lim+1) + ggtitle("With Calculated Values")
)
# Create the common title
title <- ggdraw() +
draw_label(
"Histogram of Total Steps by Day",
fontface = 'bold',
x = 0,
hjust = -0.5
) +
theme(
plot.margin = margin(0, 0, 0, 0)
)
# Show the histograms under common title
plot_grid(
title, hists,
ncol = 1,
# rel_heights values control vertical title margins
rel_heights = c(0.1, 1)
)
steps_by_day_noNA %>%
kbl() %>%
kable_styling(
bootstrap_options = "striped",
full_width = F
)
date | Mean | Median | Total |
---|---|---|---|
2012-10-01 | 35.29 | 26.35 | 10163 |
2012-10-02 | 0.44 | 0.00 | 126 |
2012-10-03 | 39.42 | 0.00 | 11352 |
2012-10-04 | 42.07 | 0.00 | 12116 |
2012-10-05 | 46.16 | 0.00 | 13294 |
2012-10-06 | 53.54 | 0.00 | 15420 |
2012-10-07 | 38.25 | 0.00 | 11015 |
2012-10-08 | 35.29 | 26.35 | 10163 |
2012-10-09 | 44.48 | 0.00 | 12811 |
2012-10-10 | 34.38 | 0.00 | 9900 |
2012-10-11 | 35.78 | 0.00 | 10304 |
2012-10-12 | 60.35 | 0.00 | 17382 |
2012-10-13 | 43.15 | 0.00 | 12426 |
2012-10-14 | 52.42 | 0.00 | 15098 |
2012-10-15 | 35.20 | 0.00 | 10139 |
2012-10-16 | 52.38 | 0.00 | 15084 |
2012-10-17 | 46.71 | 0.00 | 13452 |
2012-10-18 | 34.92 | 0.00 | 10056 |
2012-10-19 | 41.07 | 0.00 | 11829 |
2012-10-20 | 36.09 | 0.00 | 10395 |
2012-10-21 | 30.63 | 0.00 | 8821 |
2012-10-22 | 46.74 | 0.00 | 13460 |
2012-10-23 | 30.97 | 0.00 | 8918 |
2012-10-24 | 29.01 | 0.00 | 8355 |
2012-10-25 | 8.65 | 0.00 | 2492 |
2012-10-26 | 23.53 | 0.00 | 6778 |
2012-10-27 | 35.14 | 0.00 | 10119 |
2012-10-28 | 39.78 | 0.00 | 11458 |
2012-10-29 | 17.42 | 0.00 | 5018 |
2012-10-30 | 34.09 | 0.00 | 9819 |
2012-10-31 | 53.52 | 0.00 | 15414 |
2012-11-01 | 35.29 | 26.35 | 10163 |
2012-11-02 | 36.81 | 0.00 | 10600 |
2012-11-03 | 36.70 | 0.00 | 10571 |
2012-11-04 | 35.29 | 26.35 | 10163 |
2012-11-05 | 36.25 | 0.00 | 10439 |
2012-11-06 | 28.94 | 0.00 | 8334 |
2012-11-07 | 44.73 | 0.00 | 12883 |
2012-11-08 | 11.18 | 0.00 | 3219 |
2012-11-09 | 43.22 | 22.29 | 12448 |
2012-11-10 | 43.22 | 22.29 | 12448 |
2012-11-11 | 43.78 | 0.00 | 12608 |
2012-11-12 | 37.38 | 0.00 | 10765 |
2012-11-13 | 25.47 | 0.00 | 7336 |
2012-11-14 | 35.29 | 26.35 | 10163 |
2012-11-15 | 0.14 | 0.00 | 41 |
2012-11-16 | 18.89 | 0.00 | 5441 |
2012-11-17 | 49.79 | 0.00 | 14339 |
2012-11-18 | 52.47 | 0.00 | 15110 |
2012-11-19 | 30.70 | 0.00 | 8841 |
2012-11-20 | 15.53 | 0.00 | 4472 |
2012-11-21 | 44.40 | 0.00 | 12787 |
2012-11-22 | 70.93 | 0.00 | 20427 |
2012-11-23 | 73.59 | 0.00 | 21194 |
2012-11-24 | 50.27 | 0.00 | 14478 |
2012-11-25 | 41.09 | 0.00 | 11834 |
2012-11-26 | 38.76 | 0.00 | 11162 |
2012-11-27 | 47.38 | 0.00 | 13646 |
2012-11-28 | 35.36 | 0.00 | 10183 |
2012-11-29 | 24.47 | 0.00 | 7047 |
2012-11-30 | 43.22 | 22.29 | 12448 |
The most common daily counts (i.e. the modal values) appear to have been disproportionally stretched, while the tails appear unchanged. Since we are adding average values, this is expected.
The skewness of the data increases for the dataset with calculated values indicating the mean and median have been pulled to the right.This is in keeping with the above observation as the most common values are higher than the mean and median.
skewness(df$steps, na.rm=T)
## [1] 4.171477
skewness(df_noNA$steps)
## [1] 4.337421
Total number of steps have expectedly risen given that most
NA
values were replaced with values greater than zero.
total_with_na <- sum(df$steps, na.rm = T)
total_with_na
## [1] 570608
total_no_na <- as.integer(sum(df_noNA$steps))
total_no_na
## [1] 658763
Total steps with NAs: 570608
Total steps with calculated values: 658763
Increase: 88155
Looking at the increase per day, it appears to have only affected the overall count for days where all values were NA. We can infer (presuming the calculations are correct) that any NA values with partial step counts occurred when the mean step count for that time/day of week were also 0.
raw_step_count <- steps_by_day$Total
# NA values count zero towards the sum
raw_step_count[is.na(raw_step_count)] <- 0
daily_diff <- steps_by_day_noNA$Total - raw_step_count
daily_diff
## [1] 10163 0 0 0 0 0 0 10163 0 0 0 0
## [13] 0 0 0 0 0 0 0 0 0 0 0 0
## [25] 0 0 0 0 0 0 0 10163 0 0 10163 0
## [37] 0 0 0 12448 12448 0 0 0 10163 0 0 0
## [49] 0 0 0 0 0 0 0 0 0 0 0 0
## [61] 12448
df$day_type <- factor(ifelse(wday(df$date)<6, "Weekday", "Weekend"))
head(df)
## steps date interval day_type
## 1 NA 2012-10-01 00:00 Weekday
## 2 NA 2012-10-01 00:05 Weekday
## 3 NA 2012-10-01 00:10 Weekday
## 4 NA 2012-10-01 00:15 Weekday
## 5 NA 2012-10-01 00:20 Weekday
## 6 NA 2012-10-01 00:25 Weekday
steps_by_interval <- df %>%
group_by(day_type, interval) %>%
summarise(
Mean=round(mean(steps, na.rm=T),2)
)
## `summarise()` has grouped output by 'day_type'. You can override using the `.groups` argument.
ggplot(steps_by_interval, aes(y=Mean, x=interval, group=1)) +
geom_line() +
ggtitle("Mean Steps by Time Interval") +
ylab("Mean Steps per Time Interval") +
theme(axis.text.x = element_text(angle=45, hjust = 1)) +
scale_x_discrete(breaks = every_nth(n = 12)) +
xlab("Time of Day") +
facet_wrap(~ day_type, ncol = 1)
The initial comparison shows that weekdays have more activity in the early part of the day (6am - 8am), however the weekends show a bigger peak between 8am and 10am and more sustained activity during the afternoons and early evening.
By looking at the mean total steps each day by weekend or weekday, we can see weekend days are on average approximately 20% more active than weekdays.
df %>%
group_by(day_type) %>%
summarise(
Mean=round(mean(steps, na.rm=T),2)
)
## # A tibble: 2 x 2
## day_type Mean
## <fct> <dbl>
## 1 Weekday 35.3
## 2 Weekend 43.2