Reproducible Research: Peer Assessment 1

# Work from the assignment checkout when it exists on this machine. When it
# does not, stay in the current directory instead of aborting, so the analysis
# can still be run from wherever the report lives.
if (dir.exists("~/Assignments/RepData_PeerAssessment1")) {
  setwd("~/Assignments/RepData_PeerAssessment1")
}

Libraries

Load all the libraries we’ll be using

library(knitr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(reshape2)
library(data.table)

Loading and Processing the Data

About the initial data set

Dataset: Activity monitoring data [52K]

The variables included in this dataset are:

steps: Number of steps taken in a 5-minute interval (missing values are coded as NA)
date: The date on which the measurement was taken in YYYY-MM-DD format
interval: Identifier for the 5-minute interval in which measurement was taken

The dataset is stored in a comma-separated-value (CSV) file and there are a total of 17,568 observations in this dataset.

Download the data

Check to see if the file is already downloaded and unzipped. If not, fetch it from its URL

# Fetch and unpack the raw data only when the CSV is not already on disk.
# An archive left behind by an earlier run is reused rather than downloaded
# again.
if (!file.exists("activity.csv")) {
  if (!file.exists("activity.zip")) {
    # mode = "wb" writes the archive as binary so it cannot be corrupted by
    # text-mode line-ending translation.
    download.file(
      "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip",
      "activity.zip",
      mode = "wb"
    )
  }
  unzip("activity.zip")
}

Read the data

Read the data into a master data frame variable activity. We’ll create temporary data frames based on the initial data set.

# Load the raw measurements. Both the literal "NA" and empty fields are read
# as missing, and text columns are kept as character rather than factor.
activity <- read.csv(
  file = "activity.csv",
  na.strings = c("NA", ""),
  stringsAsFactors = FALSE
)
str(activity)

## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : chr  "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...

For some explorations of this data, we will need to know which day of the week each record comes from as well as whether or not that day is a weekday or weekend.

Create additional variables:

weekdays: Factor holding the day of the week, calculated from the date variable. It is created with seven levels ordered “Monday” through “Sunday” so that results sort by day of week rather than in the default alphabetical order.
workday: Factor with two levels, calculated from the newly created weekdays variable.
0: weekend
1: weekday

# Convert the raw columns and derive the two calendar factors.
#   date     -> Date
#   interval -> numeric
#   weekdays -> factor, levels ordered Monday..Sunday
#   workday  -> factor, "0" = weekend, "1" = weekday
# The day of week is taken from format(date, "%u") (1 = Monday .. 7 = Sunday)
# rather than from weekdays(): weekdays() returns names in the session locale,
# so matching them against English level names yields all-NA outside an
# English locale.
activity <- mutate(
  .data = activity,
  date = as.Date(date),
  interval = as.numeric(interval),
  weekdays = factor(
    as.integer(format(date, "%u")),
    levels = 1:7,
    labels = c(
      "Monday", "Tuesday", "Wednesday", "Thursday",
      "Friday", "Saturday", "Sunday"
    )
  ),
  workday = factor(
    ifelse(weekdays == "Saturday" | weekdays == "Sunday", 0, 1),
    levels = c(0, 1)
  )
)

str(activity)

## 'data.frame':    17568 obs. of  5 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : Date, format: "2012-10-01" "2012-10-01" ...
##  $ interval: num  0 5 10 15 20 25 30 35 40 45 ...
##  $ weekdays: Factor w/ 7 levels "Monday","Tuesday",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ workday : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...

What is mean total number of steps taken per day?

Using the dplyr package, group and summarize the data by date and assign to a new data frame called totalsteps

In this first exploration, we’ll leave out missing values from the calculation of mean and median.

Calculate mean steps, store in variable meansteps. Calculate median steps, store in variable mediansteps

# Total steps per day, leaving missing measurements out of the calculation.
# Rows with NA steps are dropped *before* summing: sum(..., na.rm = TRUE) on a
# day with no recorded values returns 0, which would add spurious zero-step
# days and pull the mean and median down.
totalsteps <- activity %>%
  select(date, steps) %>%
  filter(!is.na(steps)) %>%
  group_by(date) %>%
  summarise(dailysteps = sum(steps)) %>%
  as.data.frame()

# Mean and median of the daily totals over the days that have data.
meansteps <- mean(totalsteps$dailysteps, na.rm = TRUE)
mediansteps <- median(totalsteps$dailysteps, na.rm = TRUE)

Generate a histogram of totalsteps with 20 breaks using the base graphic package with a mean line in red and a median line in blue

# Base-graphics histogram of the per-day totals, with vertical reference lines
# for the mean (red) and median (blue) and a text label next to each line.
hist(
  totalsteps$dailysteps,
  breaks = 20,
  main = "Histogram of Total Daily Steps",
  xlab = "Total Daily Steps",
  ylab = "Frequency of Daily Step Count",
  border = "darkolivegreen"
)
abline(v = meansteps, lwd = 2, col = "red")
abline(v = mediansteps, lwd = 2, col = "blue")
text(
  x = meansteps, y = 8,
  labels = paste("mean", "\n", as.integer(meansteps)),
  pos = 2, col = "red"
)
text(
  x = mediansteps + 500, y = 8,
  labels = paste("median", "\n", mediansteps),
  pos = 4, col = "blue"
)

rm(totalsteps) # the per-day totals are not needed past this point

What is the average daily activity pattern?

Since the individual dates don’t matter for this question, we’ll create a data frame dailypattern from activity subsetting only the interval and steps columns.

We’ll create hurry to hold the maximum value and its corresponding interval.

# Average number of steps in each 5-minute interval across all days
# (missing measurements ignored).
dailypattern <- activity %>%
  select(interval, steps) %>%
  group_by(interval) %>%
  summarise(avgsteps = mean(steps, na.rm = TRUE)) %>%
  as.data.frame()

summary1 <- summary(dailypattern$avgsteps)

dailymean <- as.integer(mean(dailypattern$avgsteps, na.rm = TRUE))
dailymedian <- median(dailypattern$avgsteps, na.rm = TRUE)

# Interval with the highest average, taken from the per-interval averages
# before reshaping. which.max() returns exactly one row even when several
# intervals tie, and the search never looks at the constant mean/median
# helper rows added below. Columns are named interval/value for the plot code.
hurry <- dailypattern[which.max(dailypattern$avgsteps), c("interval", "avgsteps")]
names(hurry) <- c("interval", "value")

# Add the overall mean and median as constant columns so they can be drawn as
# horizontal lines in the same chart.
dailypattern <- mutate(
  dailypattern,
  mean = mean(avgsteps, na.rm = TRUE),
  median = median(avgsteps, na.rm = TRUE)
)

# Stack the three value columns into one (statistic, value) pair per row.
# Columns are selected by name rather than by position.
dailypattern <- gather(dailypattern, "statistic", "value", avgsteps, mean, median)

# Class the gathered key as a factor with three levels and relabel them to be
# more human-friendly.
dailypattern$statistic <- factor(
  dailypattern$statistic,
  levels = c("avgsteps", "mean", "median"),
  labels = c("Steps", "Mean", "Median")
)

The interval with the maximum mean steps is:

# Print the interval identifier at which the average step count peaks.
hurry[["interval"]]

## [1] 835

Which corresponds to 8:35 a.m.

# Line chart of the average daily pattern with the overall mean and median,
# plus a purple marker and label at the busiest interval.
ggplot(dailypattern, aes(x = interval, y = value, group = statistic, col = statistic)) +
  ylab("Number of Steps per Interval") +
  xlab("Time of Day") +
  geom_line() +
  # geom_vline is the dedicated layer for a vertical reference line; it does
  # not depend on mapping an outside vector through aes().
  geom_vline(xintercept = hurry$interval, color = "purple") +
  annotate(
    "text",
    x = hurry$interval - 50, y = hurry$value,
    label = as.integer(hurry$value),
    col = "purple", size = 4
  ) +
  scale_color_manual(values = c("darkolivegreen", "red", "blue")) +
  scale_x_continuous(
    breaks = c(0, 300, 600, hurry$interval, 1200, 1500, 1800, 2100, 2355),
    labels = c(
      "midnight", "3:00", "6:00",
      # Interval identifiers encode the clock time as HHMM, so the tick label
      # for the peak is derived from the value instead of being hard-coded.
      sprintf(
        "%d:%02d",
        as.integer(hurry$interval) %/% 100L,
        as.integer(hurry$interval) %% 100L
      ),
      "noon", "15:00", "18:00", "21:00", "midnight"
    )
  )

Imputing missing values

Two strategies for imputing missing values include:
1. find the mean and median of each interval or
2. find the mean and median of each interval per weekday.

We’ve already seen what an average day looks like (above); let’s compare averages per weekday to see if any patterns emerge.

# Average steps per interval for each day of the week.
# The mean is used rather than the total: a sum scales with how many
# non-missing days each weekday happens to have, so totals are not comparable
# between the facets.
intervalpattern <- activity %>%
  select(weekdays, interval, steps) %>%
  group_by(weekdays, interval) %>%
  summarise(steps = mean(steps, na.rm = TRUE)) %>%
  as.data.frame()

# One facet row per weekday, all sharing the same time-of-day axis.
ggplot(intervalpattern, aes(x = interval, y = steps, group = weekdays, col = weekdays)) +
  geom_line() +
  facet_grid(weekdays ~ .) +
  scale_x_continuous(
    breaks = c(0, 300, 600, 900, 1200, 1500, 1800, 2100, 2355),
    labels = c(
      "midnight", "3:00", "6:00", "9:00", "noon",
      "15:00", "18:00", "21:00", "midnight"
    )
  ) +
  theme_bw()

There does appear to be a difference in the activity patterns per weekday.

It’s notable that Monday through Wednesday have similar shapes, as do Thursday through Saturday. Sunday has a “smoother” profile lacking the extreme spikes of activity that we see on the other six days.

Because of these daily differences, we’ll impute the missing data by interval by weekday.

Create a data frame called missing by selecting the weekdays, interval and steps columns from the activity data frame.

Take the mean of each interval per weekday. This should produce 2016 objects (288 intervals X 7 days).

# Lookup table of the mean step count for every (weekday, interval) pair,
# computed from the non-missing measurements only.
missing <- activity %>%
  select(weekdays, interval, steps) %>%
  group_by(weekdays, interval) %>%
  summarise(msteps = mean(steps, na.rm = TRUE)) %>%
  as.data.frame()

str(missing)

## 'data.frame':    2016 obs. of  3 variables:
##  $ weekdays: Factor w/ 7 levels "Monday","Tuesday",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ interval: num  0 5 10 15 20 25 30 35 40 45 ...
##  $ msteps  : num  1.43 0 0 0 0 ...

Create a new data frame from the activity data set and call it completesteps.

Left-join a column to add the mean steps per interval per day as msteps.

# Start from the full activity table and attach, to every row, the mean step
# count (msteps) for that row's weekday and interval.
completesteps <- activity %>%
  left_join(missing, by = c("weekdays", "interval"))
str(completesteps)

## 'data.frame':    17568 obs. of  6 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : Date, format: "2012-10-01" "2012-10-01" ...
##  $ interval: num  0 5 10 15 20 25 30 35 40 45 ...
##  $ weekdays: Factor w/ 7 levels "Monday","Tuesday",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ workday : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ msteps  : num  1.43 0 0 0 0 ...

Find the records that have missing data and substitute in the value from msteps.

# Fill each missing step count with the weekday/interval mean joined in as
# msteps; recorded values are kept as they are.
completesteps <- mutate(
  completesteps,
  steps = ifelse(is.na(steps), msteps, steps)
)

# Daily totals recomputed on the completed data.
newsteps <- completesteps %>%
  select(date, steps) %>%
  group_by(date) %>%
  summarise(dailysteps = sum(steps, na.rm = TRUE)) %>%
  as.data.frame()

# Mean and median of the daily totals after imputation.
newmeansteps <- mean(newsteps$dailysteps, na.rm = TRUE)
newmediansteps <- median(newsteps$dailysteps, na.rm = TRUE)

# Histogram of the daily totals from the completed (imputed) data, with the
# new mean in red and the new median in blue, each labelled beside its line.
hist(
  newsteps$dailysteps,
  breaks = 20,
  main = "Histogram of Total Daily Steps",
  xlab = "Total Daily Steps",
  ylab = "Frequency of Daily Step Count",
  border = "darkolivegreen"
)
abline(v = newmeansteps, lwd = 2, col = "red")
abline(v = newmediansteps, lwd = 2, col = "blue")
text(
  x = newmeansteps, y = 8,
  labels = paste("mean", "\n", as.integer(newmeansteps)),
  pos = 2, col = "red"
)
text(
  x = newmediansteps + 500, y = 8,
  labels = paste("median", "\n", newmediansteps),
  pos = 4, col = "blue"
)

Are there differences in activity patterns between weekdays and weekends?

Using our new, complete data set, we create our final data set by first subsetting completesteps into two new datasets, workweek and weekend.

workweek is extracted by filtering in rows where workday == 1, adding the column day and filling it with the string “work”

weekend is extracted by filtering in rows where workday ==0, adding a new column day and filling in with string data, “weekend”

Our final dataset, week, is created by rbinding workweek and weekend

# Average steps per interval on weekend days (workday level "0"), tagged with
# day = "weekend".
weekend <- completesteps %>%
  filter(workday == 0) %>%
  group_by(interval) %>%
  summarise(day = "weekend", steps = mean(steps, na.rm = TRUE)) %>%
  as.data.frame()

mweekend <- mean(weekend$steps)

# Average steps per interval on working days (workday level "1"), tagged with
# day = "work".
workweek <- completesteps %>%
  filter(workday == 1) %>%
  group_by(interval) %>%
  summarise(day = "work", steps = mean(steps, na.rm = TRUE)) %>%
  as.data.frame()

mworkweek <- mean(workweek$steps)

# Stack both profiles, working days first, into the frame used for plotting.
week <- rbind(workweek, weekend)

For the final plot, I used stat_smooth to make the lines easier to read. The general pattern is still discernible, but a bit easier on the eyes.

# Smoothed weekday-vs-weekend activity profiles: one degree-20 polynomial
# least-squares fit per day type, drawn without a confidence band.
ggplot(week, aes(x = interval, y = steps, group = day, col = day)) +
  stat_smooth(
    method = "lm",
    formula = y ~ poly(x, 20),
    se = FALSE,
    lwd = 1
  ) +
  scale_x_continuous(
    breaks = c(0, 300, 600, 900, 1200, 1500, 1800, 2100, 2355),
    labels = c(
      "midnight", "3:00", "6:00", "9:00", "noon",
      "15:00", "18:00", "21:00", "midnight"
    )
  ) +
  theme_bw()