1. Loading and processing the data

The data are stored in a data frame called activity, which has 17,568 observations from a personal activity monitoring device.

A quick look at the data shows,

In addition, the activity data were transformed to summarize steps by day and by interval, giving the dailyStats and intervalStats data frames respectively.

# Switch to the author's local project checkout only when it exists; on any
# other machine the analysis runs from the current working directory, which
# must then contain activity.zip.
projectDir <- "C:/Users/Mike/Documents/Projects/dataScience/ReproducibleResearch/RepData_PeerAssessment1"
if (dir.exists(projectDir)) {
  setwd(projectDir)
}
library(xtable)
library(dplyr)
library(ggplot2)
# Load the raw observations by reading activity.csv directly out of the zip
# archive; read.csv() opens and closes the connection itself.
zipConnection <- unz(description = "activity.zip", filename = "activity.csv")
activity <- read.csv(zipConnection)

# Per-day summary of the recorded steps: mean, median and total, with missing
# readings dropped. A day whose readings are all NA therefore gets
# avgSteps = NaN, medianSteps = NA and totalSteps = 0.
dailyStats <- activity %>%
  group_by(date) %>%
  summarize(
    avgSteps = mean(steps, na.rm = TRUE),
    medianSteps = median(steps, na.rm = TRUE),
    totalSteps = sum(steps, na.rm = TRUE)
  )

# Same three statistics, computed per 5-minute interval across all days.
intervalStats <- activity %>%
  group_by(interval) %>%
  summarize(
    avgSteps = mean(steps, na.rm = TRUE),
    medianSteps = median(steps, na.rm = TRUE),
    totalSteps = sum(steps, na.rm = TRUE)
  )

# Give every interval a clock time so plots can show time of day. The date
# part (2012-11-30) is only a carrier for the time; the sequence covers one
# day in 5-minute steps and is bound on as a new column in row order.
dayStart <- as.POSIXct("2012-11-30 00:00:00")
dayEnd <- as.POSIXct("2012-11-30 23:55:00")
timeInterval <- seq(from = dayStart, to = dayEnd, by = "5 mins")
intervalStats <- cbind(intervalStats, timeInterval)

# Preview the first six raw observations.
head(activity, n = 6L)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25
# Preview the first six per-day summaries.
head(dailyStats, n = 6L)
## # A tibble: 6 x 4
##         date avgSteps medianSteps totalSteps
##       <fctr>    <dbl>       <dbl>      <int>
## 1 2012-10-01      NaN          NA          0
## 2 2012-10-02  0.43750           0        126
## 3 2012-10-03 39.41667           0      11352
## 4 2012-10-04 42.06944           0      12116
## 5 2012-10-05 46.15972           0      13294
## 6 2012-10-06 53.54167           0      15420
# Preview the first six per-interval summaries, including the clock time.
head(intervalStats, n = 6L)
##   interval  avgSteps medianSteps totalSteps        timeInterval
## 1        0 1.7169811           0         91 2012-11-30 00:00:00
## 2        5 0.3396226           0         18 2012-11-30 00:05:00
## 3       10 0.1320755           0          7 2012-11-30 00:10:00
## 4       15 0.1509434           0          8 2012-11-30 00:15:00
## 5       20 0.0754717           0          4 2012-11-30 00:20:00
## 6       25 2.0943396           0        111 2012-11-30 00:25:00

2. Steps taken each day - histogram

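The histogram shows that the individual most often takes between 10,000 and 15,000 steps in a day; however, the distribution looks skewed towards zero steps. Part of that skew comes from days with no recorded data, whose total is computed as zero steps (for example 2012-10-01 in the summary above).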

# Histogram of total steps per day, labelled the same way as the histogram of
# the imputed data so the two can be compared directly. Days without any
# recorded steps appear in the lowest bin because their total is 0.
hist(dailyStats$totalSteps, xlab = "Total Daily Steps", main = "Histogram of Daily Total Steps")

3. Mean/Median Steps taken each day

On average, this individual takes 9354 steps per day. The median number of steps per day is 10395

We see that the median number of steps per 5-minute interval within each day equals zero. This may be because many of the readings are taken during times when an individual wouldn't be walking around; that is, the readings do not cover only waking hours. For example, the five-minute intervals start near midnight and register zero for many hours, presumably while the individual is sleeping.

The table below shows, for each day, the total, average, and median number of steps taken

# Render the per-day summary as an HTML table.
# NOTE(review): `auto` is documented as an xtable() argument; confirm that
# passing it to print() here has any effect.
dailyTable <- xtable(dailyStats)
print(dailyTable, type = "html", auto = TRUE)
date avgSteps medianSteps totalSteps
1 2012-10-01 0
2 2012-10-02 0.44 0.00 126
3 2012-10-03 39.42 0.00 11352
4 2012-10-04 42.07 0.00 12116
5 2012-10-05 46.16 0.00 13294
6 2012-10-06 53.54 0.00 15420
7 2012-10-07 38.25 0.00 11015
8 2012-10-08 0
9 2012-10-09 44.48 0.00 12811
10 2012-10-10 34.38 0.00 9900
11 2012-10-11 35.78 0.00 10304
12 2012-10-12 60.35 0.00 17382
13 2012-10-13 43.15 0.00 12426
14 2012-10-14 52.42 0.00 15098
15 2012-10-15 35.20 0.00 10139
16 2012-10-16 52.38 0.00 15084
17 2012-10-17 46.71 0.00 13452
18 2012-10-18 34.92 0.00 10056
19 2012-10-19 41.07 0.00 11829
20 2012-10-20 36.09 0.00 10395
21 2012-10-21 30.63 0.00 8821
22 2012-10-22 46.74 0.00 13460
23 2012-10-23 30.97 0.00 8918
24 2012-10-24 29.01 0.00 8355
25 2012-10-25 8.65 0.00 2492
26 2012-10-26 23.53 0.00 6778
27 2012-10-27 35.14 0.00 10119
28 2012-10-28 39.78 0.00 11458
29 2012-10-29 17.42 0.00 5018
30 2012-10-30 34.09 0.00 9819
31 2012-10-31 53.52 0.00 15414
32 2012-11-01 0
33 2012-11-02 36.81 0.00 10600
34 2012-11-03 36.70 0.00 10571
35 2012-11-04 0
36 2012-11-05 36.25 0.00 10439
37 2012-11-06 28.94 0.00 8334
38 2012-11-07 44.73 0.00 12883
39 2012-11-08 11.18 0.00 3219
40 2012-11-09 0
41 2012-11-10 0
42 2012-11-11 43.78 0.00 12608
43 2012-11-12 37.38 0.00 10765
44 2012-11-13 25.47 0.00 7336
45 2012-11-14 0
46 2012-11-15 0.14 0.00 41
47 2012-11-16 18.89 0.00 5441
48 2012-11-17 49.79 0.00 14339
49 2012-11-18 52.47 0.00 15110
50 2012-11-19 30.70 0.00 8841
51 2012-11-20 15.53 0.00 4472
52 2012-11-21 44.40 0.00 12787
53 2012-11-22 70.93 0.00 20427
54 2012-11-23 73.59 0.00 21194
55 2012-11-24 50.27 0.00 14478
56 2012-11-25 41.09 0.00 11834
57 2012-11-26 38.76 0.00 11162
58 2012-11-27 47.38 0.00 13646
59 2012-11-28 35.36 0.00 10183
60 2012-11-29 24.47 0.00 7047
61 2012-11-30 0

4. Time Series of Daily activity pattern

# Line chart of the average number of steps in each 5-minute interval,
# plotted against the time of day.
with(
  intervalStats,
  plot(
    timeInterval,
    avgSteps,
    type = "l",
    ylab = "Avg Number of Steps",
    xlab = "5-minute interval in time of day (24 hour)",
    main = "Average steps per 5-min interval across all days"
  )
)

5. The biggest 5-minute interval

# Locate the interval with the highest average step count. which.max() returns
# exactly one row (the first one on ties), so both results are scalars.
peakRow <- which.max(intervalStats$avgSteps)

# "%I:%M %p" takes AM/PM from the timestamp itself rather than appending a
# literal "AM", so a peak in the afternoon is labelled correctly. The AM/PM
# text is supplied by the session locale.
highInterval <- format(intervalStats$timeInterval[peakRow], format = "%I:%M %p")

# Average steps in that interval, rounded to a whole number for the narrative.
highAvg <- round(intervalStats$avgSteps[peakRow], 0)

The 5-minute interval that has the highest average number of steps starts at 08:35 AM with an average of 206 steps.

6. Missing data and imputing values

# Number of observations whose step reading is missing.
missingCount <- sum(is.na(activity$steps))

There are 2304 missing data elements

# Determine how many step readings are missing on each day and list only the
# days that have at least one. The count comes straight from is.na(steps), so
# the activity data frame itself is left unmodified. The count is kept as a
# double so the table renders it in the same number format as before.
missingByDay <- activity %>%
  group_by(date) %>%
  summarize(missing = sum(as.numeric(is.na(steps)))) %>%
  filter(missing > 0)
print(xtable(missingByDay), type = "html", auto = FALSE)
date missing
1 2012-10-01 288.00
2 2012-10-08 288.00
3 2012-11-01 288.00
4 2012-11-04 288.00
5 2012-11-09 288.00
6 2012-11-10 288.00
7 2012-11-14 288.00
8 2012-11-30 288.00

This shows the number of missing values per day. You can see that every day with missing data has exactly 288 missing values (there are 288 five-minute increments in a 24-hour period). Therefore, if a value is missing, it is missing for the entire day. This could mean that the device was not worn on those days or that there is some other day-related phenomenon. The strategy for handling these missing values could be to simply ignore those days. Alternatively (and because it is explicitly outlined in the assignment), we will impute values for those five-minute increments each day.

The imputed value for each five-minute interval will be the median number of steps for that interval across all days without NA.

# Attach each interval's summary values (the first three columns of
# intervalStats) to every observation; merge() joins on the shared column(s).
intervalLookup <- intervalStats[, 1:3]
i_activity <- merge(activity, intervalLookup, all = FALSE)

# Fill every missing step reading with the median for its interval.
stepsMissing <- is.na(i_activity$steps)
i_activity$steps[stepsMissing] <- i_activity$medianSteps[stepsMissing]

# Recompute the per-day mean, median and total on the imputed data.
i_dailyStats <- i_activity %>%
  group_by(date) %>%
  summarize(
    avgSteps = mean(steps, na.rm = TRUE),
    medianSteps = median(steps, na.rm = TRUE),
    totalSteps = sum(steps, na.rm = TRUE)
  )

7. Histogram with imputed values

The histogram for imputed values looks nearly the same as the histogram on data without.

# Histogram of total steps per day after imputation.
hist(i_dailyStats$totalSteps, xlab = "Total Daily Steps", main = "Histogram of Daily Total Steps")

# Mean and median of the daily totals. These must be taken from the imputed
# summary (i_dailyStats); using dailyStats would only repeat the figures for
# the data without imputation.
i_avgStepsDaily <- round(mean(i_dailyStats$totalSteps), digits = 0)
i_medianStepsDaily <- median(i_dailyStats$totalSteps)

Additionally, the median total steps taken each day is the same between the two datasets, at: 10395. This is expected, since each missing day is filled with the interval medians, which sum to a small daily total and so leave the middle of the distribution unchanged. The average number of daily steps, however, works out to be slightly higher (by 150 steps, or 1.6% more) at: 9504

Imputing the missing data makes little difference to the statistics and estimated daily steps. My recommendation is to remove the missing days from the analysis where possible.

8. Weekdays vs. Weekends

# Add a weekend or weekday indicator to the activity data with imputed values.
# POSIXlt's wday field is the day of the week as a number (0 = Sunday,
# 6 = Saturday), so the test does not depend on locale-specific day names and
# is evaluated for the whole column at once.
dayOfWeek <- as.POSIXlt(as.Date(as.character(i_activity$date)))$wday
i_activity$WE <- ifelse(dayOfWeek %in% c(0, 6), "Weekend", "Weekday")

# Mean, median and total steps for every 5-minute interval, computed
# separately for weekdays and weekends.
i_intervalStats <- i_activity %>%
  group_by(WE, interval) %>%
  summarize(
    avgSteps = mean(steps, na.rm = TRUE),
    medianSteps = median(steps, na.rm = TRUE),
    totalSteps = sum(steps, na.rm = TRUE)
  )

# Two stacked panels (one per value of WE) showing the average steps in each
# 5-minute interval as a line with point markers.
ggplot(i_intervalStats, aes(interval, avgSteps)) +
  geom_line(size = 1.1, color = "Blue") +
  geom_point(size = 1, shape = 21, color = "DarkBlue", bg = "LightBlue") +
  facet_grid(WE ~ .) +
  ggtitle("Average Steps per 5-min Interval") +
  theme_bw() +
  theme(
    strip.background = element_rect(fill = "indianred"),
    panel.grid.minor.y = element_line(colour = "pink"),
    panel.grid.major.y = element_line(colour = "pink"),
    panel.grid.major.x = element_line(colour = "pink"),
    panel.grid.minor.x = element_blank()
  )

The End