Analysis of step data from activity tracker

Loading and preprocessing the data

1. Load libraries required for analysis and set theme for ggplot2

library(dplyr)
library(ggplot2)
library(lubridate)
library(tidyr)

##  Shared ggplot2 theme for every plot in this report: centred, size-15
##  titles on a pale grey panel, with fainter minor grid lines than the
##  general grid lines
theme1 <- theme(
  panel.background = element_rect(fill = "grey97"),
  panel.grid = element_line(colour = "grey88"),
  panel.grid.minor = element_line(colour = "grey94"),
  plot.title = element_text(size = 15, hjust = 0.5)
)

2. Load and preprocess data

##  Import the raw activity records from the CSV file
data <- read.csv(file = "activity.csv")

##  Parse the date column (year-month-day text) into Date objects
data[["date"]] <- as.Date(data[["date"]], format = "%Y-%m-%d")

We look at the first 25 time intervals to see how they have been labelled. It shows that interval 55 represents 00:55 and interval 100 represents 01:00. This jump is repeated at the end of every hour.

##  Show the first 25 interval labels
head(data[["interval"]], n = 25)

##  [1]   0   5  10  15  20  25  30  35  40  45  50  55 100 105 110 115 120
## [18] 125 130 135 140 145 150 155 200

The interval labels are now converted into POSIXct formatted times to remove the jumps in interval at the end of each hour.

##  Duplicate the interval column as text and record the character length of
##  each label. nchar is vectorised, so `length` is a plain integer vector
##  rather than a list column
data$int2 <- as.character(data$interval)
data$length <- nchar(data$int2)

##  Left-pad every label with zeros to four characters (5 -> "0005",
##  835 -> "0835") so that strptime can read it as hours and minutes
data$int2 <- sprintf("%04d", data$interval)

##  Convert the padded labels into POSIXct times. strptime fills in today's
##  date, so the timezone is pinned to UTC: in a local timezone, a clock time
##  skipped by a daylight-saving change on the day the script is run would be
##  parsed as NA
data$Time <- as.POSIXct(strptime(data$int2, format = "%H%M", tz = "UTC"))

What is the mean total number of steps taken per day?

1. Calculate the total number of steps taken per day

##  Total steps for each day. Only rows whose step count is missing are
##  dropped: the filter looks at the steps column alone, so an NA in any
##  derived helper column cannot discard a valid step count. Days with no
##  recorded steps at all do not appear in the result
daily_sum <- data %>%
  filter(!is.na(steps)) %>%
  group_by(date) %>%
  summarise(dailySum = sum(steps))

daily_sum

## # A tibble: 53 x 2
##    date       dailySum
##    <date>        <int>
##  1 2012-10-02      126
##  2 2012-10-03    11352
##  3 2012-10-04    12116
##  4 2012-10-05    13294
##  5 2012-10-06    15420
##  6 2012-10-07    11015
##  7 2012-10-09    12811
##  8 2012-10-10     9900
##  9 2012-10-11    10304
## 10 2012-10-12    17382
## # … with 43 more rows

2. Histogram of the number of steps taken per day

##  Histogram of total steps per day, using 2,500-step bins from 0 to 22,500.
##  The x-axis breaks are placed on the bin edges
gg_daily_sum <- ggplot(daily_sum, aes(dailySum))

gg_daily_sum + geom_histogram(color = "black",
                              fill = "lightskyblue3",
                              breaks = seq(0, 22500, by = 2500)) +
  labs(x = "Steps per day",
       y = "Frequency",
       title = "Histogram of steps per day") +
  theme1 +
  scale_x_continuous(breaks = seq(0, 22500, by = 2500)) +
  scale_y_continuous(breaks = seq(0, 18, by = 2))

3. Mean and median of total number of steps taken per day

##  Average of the daily step totals
mean_per_day <- mean(daily_sum[["dailySum"]])

##  Middle value of the daily step totals
median_per_day <- median(daily_sum[["dailySum"]])

Mean	Median
10766.19	10765

What is the average daily activity pattern?

1. Time series plot of the 5-minute intervals and the average number of steps taken, averaged across all days

##  Mean steps for each 5-minute interval across all days, ignoring missing
##  values. Both Time and interval are grouping keys so that the original
##  interval label is carried through to the result. The grouping is removed
##  afterwards so that later steps work on a plain, ungrouped table
int_groups <- data %>%
  group_by(Time, interval) %>%
  summarise(av_step = mean(steps, na.rm = TRUE)) %>%
  ungroup()

##  Time series of average steps per interval. The rows of int_groups are in
##  time order, so the x position is the zero-based row number: x = 0 is the
##  first interval (00:00) and every 48 positions (48 five-minute intervals)
##  is four hours, which lines the points up with the axis breaks. A 1-based
##  index would place every point one interval to the right of its label.
##  The limits are set explicitly so the closing "00:00" break at 288 is
##  still drawn
gg_av_day <- ggplot(int_groups, aes(seq_along(av_step) - 1, av_step))

gg_av_day + geom_line(color = "lightskyblue4") +
  labs(x = "Time",
       y = "Average steps",
       title = "Time series of average steps by 5 minute time interval") +
  theme1 +
  scale_x_continuous(breaks = seq(0, 288, by = 48),
                     limits = c(0, 288),
                     labels = c("00:00", "04:00", "08:00",
                                "12:00", "16:00", "20:00",
                                "00:00"))

2. Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?

##  Interval label and mean step count of the busiest 5-minute interval
int_groups[which.max(int_groups$av_step), c("interval", "av_step")]

Interval	Avg. steps
835	206.1698

The 5 minute interval with the highest number of average steps was 835 where on average the individual took 206 steps. This is the 5 minute interval starting at 08:35.

Imputing missing values

1. How many missing values are there in the dataset?

##  Count the records whose step value is missing, then report the count
num_na <- sum(is.na(data[["steps"]]))
paste("Number of missing values:", num_na)

## [1] "Number of missing values: 2304"

##  Missing records expressed as a percentage of all records
per_na <- num_na / length(data[["steps"]]) * 100

Missing values
Number	Percentage
2304	13.115

There are 2,304 missing values in the dataset which represents 13.1% of the dataset.

2. Strategy for replacing missing values
As there is significant variation in the number of steps for both time of day and the day of the week, the missing values will be replaced with the average number of steps for the interval on that weekday. There may be bias involved in this method as activity levels on days where data was not collected (presumably when the device was not being worn) may be fundamentally different to activity levels on days when the device was being worn.

3. Create a new dataset with missing values filled in using method described above

##  Mean steps for each interval on each day of the week, ignoring missing
##  values. The weekday grouping expression is unnamed, so its column is
##  called `wday(date)`. The grouping is removed afterwards so that the
##  result is a plain, ungrouped lookup table
avg_by_wday <- data %>%
  group_by(wday(date), interval) %>%
  summarise(avg = mean(steps, na.rm = TRUE)) %>%
  ungroup()

##  Copy the data and fill each missing step count with the mean for the same
##  interval on the same day of the week. Each row is paired with its average
##  through a combined "interval weekday" text key; recorded step counts are
##  left as they are
imputed <- data
imputed$wd <- wday(imputed$date)
imputed$steps <- local({
  row_key <- paste(imputed$interval, imputed$wd)
  avg_key <- paste(avg_by_wday$interval, avg_by_wday$`wday(date)`)
  replacement <- avg_by_wday$avg[match(row_key, avg_key)]
  ifelse(is.na(imputed$steps), replacement, imputed$steps)
})

4. Histogram of total number of steps (imputed data)

##  Daily step totals from the original and the imputed data side by side,
##  then stacked into long form with one row per date and data source. A day
##  containing missing values has an NA total in the original-data column
hist <- data.frame(date = data$date,
                   naSteps = data$steps,
                   imptSteps = imputed$steps) %>%
  group_by(date) %>%
  summarise(naStep = sum(naSteps), imptStep = sum(imptSteps)) %>%
  gather(key = "NAs", value = "steps", -date)

##  Overlaid histograms of daily step totals before and after imputation,
##  using the same 2,500-step bins for both. Fill colours and legend labels
##  follow the alphabetical order of the NAs values: "imptStep" (imputed
##  data) first, then "naStep" (original data)
gg_daily_sum_impt <- ggplot(hist, aes(steps, fill = NAs))

##  na.rm = TRUE: original-data totals that are NA are meant to be left out
##  of the plot, so they are dropped without a warning
gg_daily_sum_impt + geom_histogram(color = "black",
                                   alpha = 0.6,
                                   breaks = seq(0, 22500, by = 2500),
                                   position = "identity",
                                   na.rm = TRUE) +
  labs(x = "Steps per day",
       y = "Frequency",
       title = "Histogram of steps per day") +
  theme1 +
  scale_y_continuous(breaks = seq(0, 22, by = 2)) +
  scale_fill_manual(name = "",
                    labels = c("NAs replaced", "NAs omitted"),
                    values = c("lightskyblue4", "lightskyblue3"))

This histogram shows both the original histogram (NAs omitted) and the new one using the imputed data (NAs replaced). The eight days that were omitted from the original plot because they had no data are now plotted, occupying the three middle bins ranging from 7,500 to 15,000.

5. Mean and median steps per day (imputed data)

##  Daily step totals computed from the imputed data
daily_sum_impt <- summarise(group_by(imputed, date), dailySum = sum(steps))

##  Average of the imputed daily totals
mean_per_day_impt <- mean(daily_sum_impt[["dailySum"]])

##  Middle value of the imputed daily totals
med_per_day_impt <- median(daily_sum_impt[["dailySum"]])

Mean	Median
10821.21	11015

6. Impact of imputing missing values

##  Change in the mean after imputation (imputed value minus original value)
mean_diff <- diff(c(mean_per_day, mean_per_day_impt))


##  Change in the median after imputation (imputed value minus original
##  value)
median_diff <- diff(c(median_per_day, med_per_day_impt))

Change in mean	Change in median
55.02	250

6.1 Percentage changes

##  Percentage change in mean, relative to the mean before imputation
mean_diff_percent <- mean_diff / mean_per_day * 100

##  Percentage change in median, relative to the median before imputation
median_diff_percent <- median_diff / median_per_day * 100

% change in mean	% change in median
0.51	2.32

Are there differences in activity patterns between weekdays and weekends?

1. Create a factor variable indicating if the day is a weekday or weekend

##  Label each date as "weekday" or "weekend" and store the result as a
##  factor. The day of the week is read numerically from POSIXlt (0 = Sunday,
##  6 = Saturday) rather than by matching English day names from weekdays(),
##  whose output follows the session locale and would label every date as a
##  weekday in a non-English locale
imputed$wd <- as.factor(ifelse(as.POSIXlt(imputed$date)$wday %in% c(0, 6),
                               "weekend", "weekday"))

2. Panel plot of time series for weekdays and weekends

##  Mean imputed steps for each interval time, separately for weekdays and
##  weekends. The grouping is removed afterwards so that the result is a
##  plain, ungrouped table
weekday_impt <- imputed %>%
  group_by(Time, wd) %>%
  summarise(int_av = mean(steps)) %>%
  ungroup()

##  Line plots of interval averages, one panel for weekdays and one for
##  weekends. The x position is the zero-based rank of each interval time, so
##  both panels share the same 0-287 positions, x = 0 is the first interval
##  (00:00) and every 48 positions (48 five-minute intervals) is four hours.
##  A running row number over the whole table would count the weekday and
##  weekend rows together and leave the points offset from the axis breaks.
##  The limits are set explicitly so the closing "00:00" break at 288 is
##  still drawn
gg_wk <- ggplot(weekday_impt,
                aes(dense_rank(Time) - 1, int_av, color = wd))

gg_wk + geom_line() +
  facet_grid(wd~.) +
  labs(x = "Time",
       y = "Average steps",
       title = "Average steps per interval: weekdays vs. weekends") +
  theme1 +
  theme(legend.position = "none") +
  scale_x_continuous(breaks = seq(0, 288, by = 48),
                     limits = c(0, 288),
                     labels = c("00:00", "04:00", "08:00",
                                "12:00", "16:00", "20:00",
                                "00:00")) +
  scale_color_manual(values = c("lightskyblue4", "firebrick4"))

The time series plot shows that a greater number of steps are taken early in the morning on weekdays; however, a greater number of steps are taken throughout the day on weekends. The table below shows that on average the individual takes approximately 2,000 more steps on days at the weekend compared to days during the week.

##  Mean daily step total for weekdays versus weekends: total the steps for
##  each date first, then average those daily totals within each day type
summarise(
  group_by(
    summarise(group_by(imputed, date, wd), sum_steps = sum(steps)),
    wd
  ),
  av_steps_per_day = mean(sum_steps)
)

	Avg. steps per day
weekday	10257.53
weekend	12406.57