This is Project 1 for the Reproducible Research course in the Coursera/JHU Certificate Program. It also is my first attempt to use the Markdown language.

First, I cleansed my R workspace and set my Working Directory:

# Remove every non-hidden object from the current environment so the
# analysis starts from a clean workspace.
rm(list = ls())
# Make the course project folder the working directory.
# NOTE(review): this is a machine-specific path; confirm it exists before
# running the document on another computer.
setwd("~/Desktop/JHU DS Certif/C5 Repro Research")

Q0: Loading and preprocessing the data

VARIABLES:

steps: Number of steps taken in a 5-minute interval (missing values are coded as NA)

date: The date on which the measurement was taken in YYYY-MM-DD format

interval: Identifier for the 5-minute interval in which measurement was taken

# Read the raw activity log, keeping text columns as character vectors.
activity <- read.csv(
  file = "~/Desktop/JHU DS Certif/C5 Repro Research/activity.csv",
  stringsAsFactors = FALSE
)
# Turn the date strings into Date objects so they can be used in date
# arithmetic and weekday/weekend checks later on.
activity[["date"]] <- as.Date(activity[["date"]])
## str(activity)

Q1: What is mean total number of steps taken per day?

# Total steps per day. The formula interface of aggregate() omits rows whose
# `steps` value is NA, so only days with recorded steps appear in the result.
countSteps <- aggregate(steps ~ date, data = activity, FUN = sum)
nrow(countSteps)
## [1] 53
# 53
# library() stops with an error if ggplot2 is missing, whereas require()
# would only return FALSE and let the plotting calls fail later.
library(ggplot2)
# Histogram of the daily totals, built with ggplot() like the other plots in
# this report (qplot() is deprecated in current ggplot2 releases).
plot1 <- ggplot(data = countSteps, aes(x = steps)) +
  geom_histogram()
plot1
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-3

# Mean of the daily step totals, computed with missing values excluded.
meanWITHna <- mean(countSteps[["steps"]], na.rm = TRUE)
meanWITHna # 10766.19
## [1] 10766
# Median of the same daily totals, again excluding missing values.
medianWITHna <- median(countSteps[["steps"]], na.rm = TRUE)
medianWITHna # 10765
## [1] 10765

Ignoring the NAs, the mean number of steps per day was `r meanWITHna`, and the median number of steps per day was `r medianWITHna`.

Q2: What is the average daily activity pattern?

# Peek at the raw data; the first rows shown have missing step counts.
head(activity)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25
# Average number of steps for each 5-minute interval across all days,
# ignoring missing values. No `data` argument is passed here: with a vector
# input it is not used by aggregate() and would only be forwarded to mean().
stepsBYinterval <- aggregate(
  activity$steps,
  by = list(activity$interval),
  FUN = mean,
  na.rm = TRUE
)
# Add descriptively named copies of the default `x` / `Group.1` columns.
stepsBYinterval$AveNumStepsPerInt <- stepsBYinterval$x
stepsBYinterval$Interval <- stepsBYinterval$Group.1
# Line plot of the average daily activity pattern.
plot2 <- ggplot(
  data = stepsBYinterval,
  aes(x = Interval, y = AveNumStepsPerInt)
) +
  geom_line()
plot2

plot of chunk unnamed-chunk-4

# Find the interval with the highest average step count. which.max() picks
# the row directly, instead of grouping by the (floating-point) averages and
# relying on the sort order of the aggregate() result.
maxSteps <- stepsBYinterval[
  which.max(stepsBYinterval$AveNumStepsPerInt),
  c("Interval", "AveNumStepsPerInt")
]
maxSteps
# At interval 835 MAXIMUM average is reached with 206.1698 steps

Q3: Imputing missing values

# Count the rows with and without missing values.
rows.complete <- sum(complete.cases(activity))
rows.w.NA <- sum(!complete.cases(activity))
rows.all <- nrow(activity)
rows.w.NA  ## Total number of rows with NAs = 2304
## [1] 2304
# Imputation value: the median of the per-interval average step counts.
plug <- median(stepsBYinterval$AveNumStepsPerInt)   ## Crude plug for NAs in 'steps' = 34.11
## New dataset w/o NAs
activity2 <- activity
activity2$steps[is.na(activity2$steps)] <- plug
# Daily totals after imputation; every day is now present.
countSteps2 <- aggregate(steps ~ date, data = activity2, FUN = sum)
nrow(countSteps2) # 61
## [1] 61
plot3 <- ggplot(data = countSteps2, aes(x = steps)) +
  geom_histogram()
plot3
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

plot of chunk unnamed-chunk-5

# Mean and median of the IMPUTED daily totals. These must come from
# countSteps2; computing them from countSteps (the pre-imputation totals)
# simply reproduces the earlier figures and hides the effect of imputation.
# NOTE(review): the outputs previously recorded here were produced from
# countSteps; re-knit the document to obtain the post-imputation values.
meanWITHOUTna <- mean(countSteps2$steps)
meanWITHOUTna
medianWITHOUTna <- median(countSteps2$steps)
medianWITHOUTna
## Impact of imputing NAs: relative change versus the figures computed
## without imputation.
meanDelta <- (meanWITHna - meanWITHOUTna) / meanWITHna
meanDelta
medianDelta <- (medianWITHna - medianWITHOUTna) / medianWITHna
medianDelta

Q4: Are there differences in activity patterns between weekdays and weekends?

# library() fails immediately if a package is missing; require() would only
# return FALSE and let later calls fail with a less helpful error.
library(timeDate)
library(gridExtra)

# Label every observation as falling on a weekday or a weekend.
activity2$typeDay <- factor(
  isWeekend(activity2$date),
  levels = c(FALSE, TRUE),
  labels = c("weekday", "weekend")
)
summary(activity2$typeDay)
## weekday weekend 
##   12960    4608
# 12960 weekday 
#  4608 weekend 
# 17568 TOTAL observations
activity2wkday <- activity2[activity2$typeDay == "weekday", ]
activity2wkend <- activity2[activity2$typeDay == "weekend", ]

# Average steps per 5-minute interval for one subset of the imputed data.
# Returns the aggregate() result with the extra `Interval` and
# `AveNumStepsPerInt` columns used for plotting.
mean_steps_by_interval <- function(day_data) {
  interval_means <- aggregate(
    day_data$steps,
    by = list(day_data$interval),
    FUN = mean
  )
  interval_means$AveNumStepsPerInt <- interval_means$x
  interval_means$Interval <- interval_means$Group.1
  interval_means
}

# Line plot of the interval averages with the given title.
plot_interval_profile <- function(interval_means, title) {
  ggplot(
    data = interval_means,
    aes(x = Interval, y = AveNumStepsPerInt)
  ) +
    geom_line() +
    ggtitle(title)
}

##
stepsBYinterval2wkday <- mean_steps_by_interval(activity2wkday)
plot4 <- plot_interval_profile(stepsBYinterval2wkday, "WEEKDAYS")
##
stepsBYinterval2wkend <- mean_steps_by_interval(activity2wkend)
plot5 <- plot_interval_profile(stepsBYinterval2wkend, "WEEKENDS")
# Stack the two panels so the weekday and weekend patterns can be compared.
grid.arrange(plot4, plot5, nrow = 2)

plot of chunk unnamed-chunk-6