Loading and preprocessing the data

Import the data

## Read activity.csv directly from inside activity.zip. The three columns are
## parsed, in file order, as integer, Date and integer.
columnTypes <- c("integer", "Date", "integer")
activity <- read.table(
    file = unz(description = "activity.zip", filename = "activity.csv"),
    sep = ",",
    header = TRUE,
    colClasses = columnTypes
)

Modify the data

## use the dplyr package
suppressMessages(library(dplyr))

## create more important time based fields
## The interval identifier is treated as an hhmm-style number: the hundreds
## are hours and the remainder is minutes (e.g. 835 -> 8 * 60 + 35 = 515).
minutesPastMidnight <- function(hhmm) {
    floor(hhmm / 100) * 60 + hhmm %% 100
}

activity <- activity %>%
    mutate(
        ## seconds after midnight, added to the observation's own date
        datetime = as.POSIXct(minutesPastMidnight(interval) * 60,
                              origin = date,
                              tz = "UTC"),
        minutes = minutesPastMidnight(interval),
        time = format(datetime, "%H:%M")
    ) %>%
    arrange(date, interval)


What is the mean total number of steps taken per day?

First, sum the number of steps taken to the daily level

## Total steps per calendar date, sorted by date. Because of na.rm = TRUE, a
## date whose readings are all missing gets a total of 0 rather than NA.
daily <- arrange(
    summarise(group_by(activity, date),
              numSteps = sum(steps, na.rm = TRUE)),
    date
)

Next, create a histogram of the total number of steps taken each day

## use the ggplot2 package (scales for axis text)
suppressMessages(library(ggplot2))
suppressMessages(library(scales))

## histogram
## Scale limits censor out-of-range values, so ggplot2 silently drops any bar
## whose edge falls outside the x limits or whose height exceeds the y limit
## (reported only as a "removed rows" warning). To keep every bar:
##   - bins are anchored at 0 (boundary = 0) so none straddles the lower edge;
##   - the x range is set on the coordinate system, which zooms without
##     discarding data;
##   - the y axis covers at least 0-10 but grows if a bin holds more days.
ggplot(data = daily,
       aes(x = numSteps)) +
    geom_histogram(col = "black",
                   fill = "steelblue",
                   binwidth = 1000,
                   boundary = 0) +
    xlab("Total Number of Steps Taken Each Day") +
    ylab("Number of Days") +
    scale_x_continuous(breaks = seq(0, 25000, 5000),
                       labels = comma) +
    ## one tick per whole day count, up to whatever the tallest bar needs
    scale_y_continuous(breaks = function(lims) seq(0, ceiling(lims[2]), by = 1),
                       labels = comma) +
    expand_limits(y = c(0, 10)) +
    coord_cartesian(xlim = c(0, 25000))


Finally, report the mean and median number of steps taken per day:
- The mean is 9,354
- The median is 10,395

## centre of the daily totals (the printed results are shown below each call)
with(daily, mean(numSteps))
## [1] 9354.23
with(daily, median(numSteps))
## [1] 10395


What is the average daily activity pattern?

First, average the number of steps taken to the five minute interval level

## Mean steps for each five-minute slot across all days, ignoring missing
## readings. summarise() only removes the last grouping variable, so without
## ungroup() the result would stay grouped by minutes and later per-row
## operations (e.g. row_number()) would run once per group.
interval <- activity %>% 
    group_by(minutes, time) %>%  
    summarise(avgSteps = mean(steps, na.rm = TRUE)) %>% 
    ungroup() %>% 
    arrange(minutes, time)

Next, create a time series plot of the average steps taken by time interval

## x-axis ticks in minutes after midnight: every two hours, plus the last
## five-minute slot of the day (1435 = 23:55), labelled as hh:mm
tickMinutes <- c(seq(0, 1400, 120), 1435)
tickLabels <- sprintf("%02d:%02d", tickMinutes %/% 60, tickMinutes %% 60)

## line chart of the average steps per five-minute slot
ggplot(interval, aes(x = minutes, y = avgSteps)) +
    geom_line(colour = "black") +
    labs(x = "Time of Day (hh:mm)",
         y = "Average Number of Steps") +
    scale_x_continuous(limits = c(0, 1435),
                       breaks = tickMinutes,
                       labels = tickLabels) +
    scale_y_continuous(limits = c(0, 225),
                       breaks = seq(0, 225, 25),
                       labels = comma)


Finally, report the time interval with the highest average number of steps

## identify observation
## Sort the slots from highest to lowest average and keep the top row. The
## conversion to a plain data frame drops any grouping first, so exactly one
## row is returned.
t <- interval %>%
    as.data.frame() %>%
    arrange(desc(avgSteps)) %>%
    slice(1) %>%
    select(minutes, time, avgSteps)

## print using xtable
suppressMessages(library(xtable))
maxIntervalTable <- xtable(t)
print(maxIntervalTable,
      type = "html",
      include.rownames = FALSE)
minutes time avgSteps
515.00 08:35 206.17


Imputing missing values

First, report the number of rows in the original data with missing values:
- The count is 2,304

## number of observations whose step count is missing
activity %>% filter(is.na(steps)) %>% nrow()
## [1] 2304

Then, using the average number of steps per interval, create a new dataset with the missing values filled in

## Attach each slot's average to every observation, use it wherever the step
## count is missing, then keep only the original columns.
act <- inner_join(activity, interval, by = c("minutes", "time"))
act <- mutate(act, steps = ifelse(is.na(steps), avgSteps, steps))
act <- select(act, steps, date, interval, datetime, minutes, time)

Next, sum the number of steps taken to the daily level

## total steps per calendar date on the imputed data, sorted by date
day <- arrange(
    summarise(group_by(act, date),
              numSteps = sum(steps)),
    date
)

Now, create a new histogram of the total number of steps taken each day

## histogram
## Scale limits censor out-of-range values, so ggplot2 silently drops any bar
## whose edge falls outside the x limits or whose height exceeds the y limit
## (reported only as a "removed rows" warning). That matters here: every day
## that was entirely missing receives the same imputed total, so those days
## pile into a single bin that can exceed a fixed limit of 10. To keep every
## bar:
##   - bins are anchored at 0 (boundary = 0) so none straddles the lower edge;
##   - the x range is set on the coordinate system, which zooms without
##     discarding data;
##   - the y axis covers at least 0-10 but grows if a bin holds more days.
ggplot(data = day,
       aes(x = numSteps)) +
    geom_histogram(col = "black",
                   fill = "steelblue",
                   binwidth = 1000,
                   boundary = 0) +
    xlab("Total Number of Steps Taken Each Day") +
    ylab("Number of Days") +
    scale_x_continuous(breaks = seq(0, 25000, 5000),
                       labels = comma) +
    ## a tick every two days keeps the axis readable if the tallest bar is high
    scale_y_continuous(breaks = function(lims) seq(0, ceiling(lims[2]), by = 2),
                       labels = comma) +
    expand_limits(y = c(0, 10)) +
    coord_cartesian(xlim = c(0, 25000))


Finally, report the new mean and median number of steps taken per day:
- The mean is 10,766
- The median is 10,766

## centre of the imputed daily totals (printed results shown below each call)
with(day, mean(numSteps))
## [1] 10766.19
with(day, median(numSteps))
## [1] 10766.19

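These new mean and median values do differ from the originals. Before imputing, days with no recorded steps were counted as zero-step days, which pulled the mean below the median (a left skew). After imputing, the daily totals are less left-skewed and closer to normally distributed. The mean increased by 1,412 steps and the median increased by 371 steps.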

Are there differences in activity patterns between weekdays and weekends?

Using the data with imputed values, create a factor to distinguish weekdays from weekend days

## create new data frame
## "%u" formats a date as its ISO weekday number (1 = Monday ... 7 = Sunday),
## so 1-5 are weekdays and everything else is labelled as weekend.
dayTypeOf <- function(dates) {
    weekdayNumber <- as.integer(format(dates, "%u"))
    dayLabel <- ifelse(weekdayNumber %in% (1:5), "weekday", "weekend")
    as.factor(dayLabel)
}

act2 <- act %>%
    mutate(dayType = dayTypeOf(date))

## quickly see counts by day type
with(act2, summary(dayType))
## weekday weekend 
##   12960    4608

Then, average the number of steps taken to the day type / five minute interval level

## Mean steps for each day type and five-minute slot. summarise() only removes
## the last grouping variable, so without ungroup() the result would stay
## grouped by dayType and minutes; ungroup() returns a plain table.
int <- act2 %>% 
    group_by(dayType, minutes, time) %>%  
    summarise(avgSteps = mean(steps, na.rm = TRUE)) %>% 
    ungroup() %>% 
    arrange(dayType, minutes, time)

Next, create a panel (faceted by day type) time series plot of the average steps taken by time interval

## x-axis ticks in minutes after midnight: every two hours, plus the last
## five-minute slot of the day (1435 = 23:55), labelled as hh:mm
tickMinutes <- c(seq(0, 1400, 120), 1435)
tickLabels <- sprintf("%02d:%02d", tickMinutes %/% 60, tickMinutes %% 60)

## one stacked panel per day type, each showing average steps per slot
ggplot(int, aes(x = minutes, y = avgSteps)) +
    geom_line(colour = "black") +
    labs(x = "Time of Day (hh:mm)",
         y = "Average Number of Steps") +
    scale_x_continuous(limits = c(0, 1435),
                       breaks = tickMinutes,
                       labels = tickLabels) +
    scale_y_continuous(limits = c(0, 250),
                       breaks = seq(0, 250, 50),
                       labels = comma) +
    facet_wrap(~ dayType, ncol = 1)