Reproducible Research: Peer Assessment 1


Loading and preprocessing the data

# Load the activity data set.
# `rpubs.upload.method` is set here as a session option; nothing else in this
# chunk reads it.
options(rpubs.upload.method = "internal")

# Switch to the author's project checkout only when it is actually present.
# On any other machine the current working directory is used instead, so the
# script no longer aborts on a path that does not exist there.
project_dir <- "~/GitHub/RepData_PeerAssessment1"
if (isTRUE(file.info(project_dir)$isdir)) {
  setwd(project_dir)
}

# Fail with a clear message rather than a bare warning from unzip() followed
# by a confusing read.csv() error.
if (!file.exists("activity.zip")) {
  stop("activity.zip not found in ", getwd(), call. = FALSE)
}
unzip("activity.zip")
data <- read.csv("activity.csv")

What is the mean total number of steps taken per day?

# Keep only the rows without missing values, then total the steps per date.
data1 <- na.omit(data)
stepdays <- with(
  data1,
  aggregate(list(steps = steps), by = list(date = date), FUN = sum)
)

library(ggplot2)

# Histogram of the daily totals, using 1000-step bins.
plot1 <- ggplot(stepdays, aes(x = steps)) +
  geom_histogram(binwidth = 1000, color = "White", fill = "red") +
  labs(title = "Total Steps Taken per Day", x = "Steps Taken", y = "Count") +
  theme(plot.title = element_text(face = "bold"))

print(plot1)

plot of chunk histogramsteps

The mean total number of steps taken per day is 10766.
The median total number of steps taken per day is 10765.

What is the average daily activity pattern?

# Mean number of steps for each 5-minute interval, over the complete rows.
stepsinterval <- with(
  data1,
  aggregate(list(steps = steps), by = list(interval = interval), FUN = mean)
)

# Line plot of the interval means.
plot2 <- ggplot(stepsinterval, aes(x = interval, y = steps)) +
  geom_line(color = "red") +
  labs(
    title = "Average Steps Taken by 5-minute Interval",
    x = "5-minute Interval",
    y = "Average Steps Taken"
  ) +
  theme(plot.title = element_text(face = "bold"))

print(plot2)

plot of chunk timeseries

# Row index (not the interval label) of the largest interval mean.
x <- which.max(stepsinterval[["steps"]])

On average, across all days in the dataset, interval 835 contains the maximum number of steps with an average of 206 steps.

Imputing missing values

There are a total of 2304 missing values (NA) in the dataset.
The missing step values will be filled in with the mean value for that 5-minute interval.

# Impute each missing step count with the mean for the same 5-minute interval,
# looked up in `stepsinterval`.
# The values are filled in place, so `data2` keeps the row order of `data`,
# and only rows whose `steps` value is missing are modified.
data2 <- data
missing_steps <- is.na(data2$steps)
data2$steps[missing_steps] <- stepsinterval$steps[
  match(data2$interval[missing_steps], stepsinterval$interval)
]

# The imputed rows on their own, kept available under their existing name.
dataNA <- data2[missing_steps, ]

# Daily totals after imputation.
stepdays2 <- aggregate(data2["steps"], by = data2["date"], FUN = sum)

# Histogram of the imputed daily totals, using 1000-step bins.
plot3 <- ggplot(stepdays2, aes(x = steps)) +
  geom_histogram(binwidth = 1000, color = "White", fill = "blue") +
  ggtitle("Total Steps Taken per Day") +
  theme(plot.title = element_text(face = "bold")) +
  xlab("Steps Taken") +
  ylab("Count")

print(plot3)

plot of chunk histogram2

The mean total number of steps taken per day is 10766.
The median total number of steps taken per day is 10766.
After filling in the missing values, the mean total number of steps remained the same, but the median increased by one.

Are there differences in activity patterns between weekdays and weekends?

# Label every observation as "weekday" or "weekend".
# The test uses the numeric `wday` field of POSIXlt (0 = Sunday, 6 = Saturday)
# instead of comparing weekdays() names with English strings, so the result
# does not depend on the session locale. The labelling is vectorised and makes
# no assumption about the number of rows in `data2`.
obs_date <- as.POSIXlt(data2$date)
data2$weekday1 <- weekdays(obs_date)
data2$weekday <- ifelse(obs_date$wday %in% c(0, 6), "weekend", "weekday")

wddata <- data2[data2$weekday == "weekday", ]
wedata <- data2[data2$weekday == "weekend", ]

# Mean steps per 5-minute interval for each day type. The label column is
# recycled to however many intervals the aggregate returns.
wdinterval <- aggregate(wddata["steps"], by = wddata["interval"], FUN = mean)
wdinterval$weekday <- "weekday"
weinterval <- aggregate(wedata["steps"], by = wedata["interval"], FUN = mean)
weinterval$weekday <- "weekend"
stepsinterval2 <- rbind(wdinterval, weinterval)

# One panel per day type, stacked vertically.
plot4 <- ggplot(stepsinterval2, aes(x = interval, y = steps)) +
  geom_line(color = "blue") +
  xlab("5-minute Interval") +
  ylab("Average Steps Taken") +
  ggtitle("Average Steps Taken by 5-minute Interval") +
  theme(plot.title = element_text(face = "bold")) +
  facet_grid(weekday ~ .)

print(plot4)

plot of chunk weekdayplot