Reproducible Research - Project 1

Xiao Yu

Examining data from a personal activity monitoring device

==============================================================

Loading and Preprocessing the Data

# Set the project working directory, then attach the packages used below.
# The attach order is dplyr, then plyr, then ggplot2: because plyr is attached
# after dplyr, plyr's arrange/summarise/mutate mask the dplyr versions (see the
# attach messages printed below).
setwd("/Users/yxshelly/Desktop/data_science/c5_reproducible research")
library(dplyr)
library(plyr)
library(ggplot2)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
# Download and unpack the raw activity data.
# The working directory is set again here, to the same project path as above.
setwd("/Users/yxshelly/Desktop/data_science/c5_reproducible research")
if (!file.exists("data")) {
        dir.create("data")
}

# Download the zip archive from the course website into "data".
# mode = "wb" requests a binary transfer explicitly, so the archive cannot be
# damaged by a text-mode download; file.path() builds the destination path
# with the platform's separator.
path <- getwd()
download.file(
        url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip",
        destfile = file.path(path, "data", "files.zip"),
        mode = "wb"
)
list.files("./data")

# Extract the archive contents into the working directory.
unzip(zipfile = "data/files.zip")

# Record when the data were downloaded.
dataDownloaded <- date()

Load the data and briefly examine it

# Read the unzipped CSV into a data frame.
# stringsAsFactors = TRUE is set explicitly so that `date` is read as a factor
# (as shown in the str() output below) on every R version; the read.csv()
# default changed to FALSE in R 4.0.0. Later steps coerce all columns of this
# data frame together, so the column types matter.
activity <- read.csv("activity.csv", stringsAsFactors = TRUE)

# Inspect the structure and the first rows.
str(activity)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...
head(activity)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25

===========================================================================

What is the mean total number of steps taken per day?

1. Calculate the total number of steps taken per day

# Total number of steps for each date.
# A day's total is NA when that day contains missing step counts, so such days
# are excluded from the histogram and summary below rather than being counted
# as zero-step days.
# Note: na.rm must not be passed as an extra argument to summarise() here;
# that only creates a constant column named `na.rm` and never reaches sum().
totalSteps <- ddply(activity, "date", summarise, tot = sum(steps))
head(totalSteps)
##         date   tot
## 1 2012-10-01    NA
## 2 2012-10-02   126
## 3 2012-10-03 11352
## 4 2012-10-04 12116
## 5 2012-10-05 13294
## 6 2012-10-06 15420

2. Make a histogram of the total number of steps taken each day

## Warning: Removed 8 rows containing non-finite values (stat_bin).
# Histogram of the daily step totals, using bins that are 1000 steps wide.
ggplot(data = totalSteps, mapping = aes(x = tot)) +
        geom_histogram(binwidth = 1000, fill = "red") +
        ggtitle("Histogram of Total Number of Daily Steps") +
        xlab("Steps") +
        ylab("Frequency")
## Warning: Removed 8 rows containing non-finite values (stat_bin).

3. Calculate and report the mean and median of the total number of steps taken per day

# Five-number summary plus mean (and NA count) of the daily totals.
summary(totalSteps[["tot"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##      41    8841   10765   10766   13294   21194       8

===========================================================================

What is the average daily activity pattern?

1. Make a time series plot of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis)

# Recode interval as a factor; it is the grouping key used below.
activity$interval <- factor(activity$interval)

# Average number of steps per interval across all days, ignoring missing values.
avePattern <- ddply(activity, "interval", summarise,
                    average = mean(steps, na.rm = TRUE))
head(avePattern, 10)
##    interval   average
## 1         0 1.7169811
## 2         5 0.3396226
## 3        10 0.1320755
## 4        15 0.1509434
## 5        20 0.0754717
## 6        25 2.0943396
## 7        30 0.5283019
## 8        35 0.8679245
## 9        40 0.0000000
## 10       45 1.4716981
str(avePattern)
## 'data.frame':    288 obs. of  2 variables:
##  $ interval: Factor w/ 288 levels "0","5","10","15",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ average : num  1.717 0.3396 0.1321 0.1509 0.0755 ...
# Plot against the numeric value of the interval labels. Passing the factor
# itself as x makes plot() draw box plots rather than a line, so the labels are
# converted back to numbers to obtain a genuine time-series line.
plot(as.numeric(as.character(avePattern$interval)), avePattern$average,
     xlab = "interval", ylab = "average of steps", type = "l",
     main = "Average Daily Activity Pattern")

2. Which 5-minute interval contains the maximum number of steps?

# Intervals ordered from the highest to the lowest average step count; the
# first row is the interval with the maximum average.
# Base order() is used so the result does not depend on which package's
# arrange() is currently masking the other. Row names are reset so the rows
# print numbered 1, 2, 3, ...
# NOTE: the object name `max` is kept for compatibility; it shadows base max()
# only as a variable (calls such as max(x) still reach the function).
max <- avePattern[order(avePattern$average, decreasing = TRUE), ]
rownames(max) <- NULL
head(max)
##   interval  average
## 1      835 206.1698
## 2      840 195.9245
## 3      850 183.3962
## 4      845 179.5660
## 5      830 177.3019
## 6      820 171.1509

===========================================================================

Imputing missing values

1. Calculate and report the total number of missing values in the dataset

# Summary of the raw step counts; the NA's entry is the number of missing values.
summary(activity[["steps"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    0.00   37.38   12.00  806.00    2304

2. Impute missing values using the median

# Identify the columns that contain missing observations.
# vapply() tests each column as it is; apply() would first coerce the whole
# data frame to a matrix.
list_na <- colnames(activity)[vapply(activity, anyNA, logical(1))]
list_na
## [1] "steps"
# Replace missing step counts with the median of the observed step counts.
# Only `steps` is modified: running ifelse() over every column would strip the
# factor attributes from `date` and `interval` and leave only their integer
# codes in the imputed data set.
activity_impute_median <- activity
activity_impute_median$steps <- as.numeric(activity_impute_median$steps)
activity_impute_median$steps[is.na(activity_impute_median$steps)] <-
        median(activity$steps, na.rm = TRUE)

3. Create a new dataset

# Summary of the step counts after imputation (no NA's entry is expected).
summary(activity_impute_median[["steps"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    0.00   32.48    0.00  806.00

4. Histogram of the total number of steps using imputed data

# Daily step totals computed from the imputed data set.
totalSteps2 <- ddply(.data = activity_impute_median, .variables = "date",
                     .fun = summarise, tot = sum(steps))
head(totalSteps2)
##   date   tot
## 1    1     0
## 2    2   126
## 3    3 11352
## 4    4 12116
## 5    5 13294
## 6    6 15420
# Histogram of the imputed daily totals, using bins that are 1000 steps wide.
ggplot(data = totalSteps2, mapping = aes(x = tot)) +
        geom_histogram(binwidth = 1000, fill = "blue") +
        ggtitle("Histogram of Total Number of Daily Steps, Imputed") +
        xlab("Steps") +
        ylab("Frequency")

# Mean and median (among other statistics) of the imputed daily totals.
summary(totalSteps2[["tot"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0    6778   10395    9354   12811   21194

With the missing values left as NA, the mean and median are 10766 and 10765

with imputed data (filled with median), the new mean and median are 9354 and 10395

===========================================================================

Are there differences between weekdays and weekends?

1. Create a new factor variable indicating whether a given date is a weekday or weekend day.

# Convert date to Date class; the date values are taken from the original
# `activity` data frame.
activity_impute_median$date <- as.Date(activity$date, format = "%Y-%m-%d")

# Binary weekday/weekend indicator.
# The numeric day of the week from POSIXlt (0 = Sunday, ..., 6 = Saturday) is
# used instead of matching the day names returned by weekdays(), because those
# names depend on the session locale and would not match English patterns in a
# non-English locale.
activity_impute_median$day <- ifelse(
        as.POSIXlt(activity_impute_median$date)$wday %in% c(0L, 6L),
        "weekend",
        "weekday"
)
table(activity_impute_median$day)
## 
## weekday weekend 
##   12960    4608
str(activity_impute_median)
## 'data.frame':    17568 obs. of  4 variables:
##  $ steps   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ date    : Date, format: "2012-10-01" "2012-10-01" ...
##  $ interval: num  1 2 3 4 5 6 7 8 9 10 ...
##  $ day     : chr  "weekday" "weekday" "weekday" "weekday" ...

2. Figure of daily pattern by weekday/weekend

# Treat interval and day as categorical variables.
activity_impute_median$interval <- factor(activity_impute_median$interval)
activity_impute_median$day <- factor(activity_impute_median$day)

# Keep the weekday rows, then average the steps within each interval.
act1 <- filter(activity_impute_median, day == "weekday")
aveP1 <- ddply(.data = act1, .variables = "interval", .fun = summarise,
               average = mean(steps))
head(aveP1, n = 10)
##    interval    average
## 1         1 2.02222222
## 2         2 0.40000000
## 3         3 0.15555556
## 4         4 0.17777778
## 5         5 0.08888889
## 6         6 1.31111111
## 7         7 0.62222222
## 8         8 1.02222222
## 9         9 0.00000000
## 10       10 1.60000000
# Keep the weekend rows, then average the steps within each interval.
act2 <- filter(activity_impute_median, activity_impute_median$day == "weekend")
aveP2 <- ddply(act2, "interval", summarise, average = mean(steps))
# Preview the weekend averages (aveP2, not the weekday table aveP1).
head(aveP2, 10)
## (Output not shown: the listing recorded at this point was the weekday table
## aveP1. Re-run this chunk to regenerate the preview for aveP2.)
# Two stacked panels: weekdays on top, weekends below.
par(mfrow = c(2, 1))

# Each series is plotted against the numeric value of its interval labels.
# Passing the factor itself as x makes plot() draw box plots rather than a
# line, so the labels are converted back to numbers and the coloured line is
# drawn by plot() directly.
plot(as.numeric(as.character(aveP1$interval)), aveP1$average,
     xlab = "interval", ylab = "average of steps", type = "l", col = "blue",
     main = "Average Daily Activity Pattern - Weekdays")

plot(as.numeric(as.character(aveP2$interval)), aveP2$average,
     xlab = "interval", ylab = "average of steps", type = "l", col = "red",
     main = "Average Daily Activity Pattern - Weekends")