# Plotting package used for the figure built at the end of this section.
library(ggplot2)

# Session setup: set the rpubs.upload.method option and move into the
# project directory.
# NOTE(review): the working directory is a hard-coded, user-specific path, so
# the script only runs on a machine where that directory exists.
options(rpubs.upload.method = "internal")
setwd("~/GitHub/RepData_PeerAssessment1")

# Extract the archive, then read activity.csv (presumably unpacked from it).
unzip("activity.zip")
data <- read.csv("activity.csv")

# Keep only the rows that have no NA in any column.
data1 <- na.omit(data)

# Total steps for each date, computed on the complete rows only.
stepdays <- aggregate(x = data1["steps"], by = data1["date"], FUN = sum)

# Histogram of the daily totals, in bins 1000 steps wide.
plot1 <- ggplot(stepdays, aes(x = steps)) +
  geom_histogram(binwidth = 1000, color = "White", fill = "red") +
  labs(title = "Total Steps Taken per Day", x = "Steps Taken", y = "Count") +
  theme(plot.title = element_text(face = "bold"))
print(plot1)
The mean total number of steps taken per day is 10766.
The median total number of steps taken per day is 10765.
# Mean number of steps for each interval value, over the complete rows.
stepsinterval <- aggregate(
  x = data1["steps"],
  by = data1["interval"],
  FUN = mean
)

# Line plot of the per-interval means.
plot2 <- ggplot(stepsinterval, aes(x = interval, y = steps)) +
  geom_line(color = "red") +
  labs(
    title = "Average Steps Taken by 5-minute Interval",
    x = "5-minute Interval",
    y = "Average Steps Taken"
  ) +
  theme(plot.title = element_text(face = "bold"))
print(plot2)

# Row index (into stepsinterval) of the interval with the highest mean.
x <- which.max(stepsinterval[["steps"]])
On average, across all days in the dataset, interval 835 contains the maximum number of steps with an average of 206 steps.
There are a total of 2304 missing values (NA) in the dataset.
Each missing step value is filled in with the mean number of steps for the same 5-minute interval, averaged over the days on which that interval was recorded.
# Rows of the raw data that contain at least one NA (the complement of the
# rows kept in data1 by na.omit()).
dataNA <- data[!complete.cases(data), ]

# Overwrite the steps of those rows with the mean for the matching interval,
# looked up in the per-interval means table.
interval_row <- match(dataNA$interval, stepsinterval$interval)
dataNA$steps <- stepsinterval$steps[interval_row]

# Recombine: filled-in rows first, then the rows that were already complete.
data2 <- rbind(dataNA, data1)

# Total steps per date on the filled-in data set.
stepdays2 <- aggregate(x = data2["steps"], by = data2["date"], FUN = sum)

# Histogram of the new daily totals, in bins 1000 steps wide.
plot3 <- ggplot(stepdays2, aes(x = steps)) +
  geom_histogram(binwidth = 1000, color = "White", fill = "blue") +
  labs(title = "Total Steps Taken per Day", x = "Steps Taken", y = "Count") +
  theme(plot.title = element_text(face = "bold"))
print(plot3)
The mean total number of steps taken per day is 10766.
The median total number of steps taken per day is 10766.
After filling in the missing values, the mean total number of steps taken per day remained the same (10766), while the median increased by one, from 10765 to 10766.
# Label every observation in data2 as "weekday" or "weekend".
#
# weekday1 holds the day name returned by weekdays(); that name is localized,
# so it is kept for reference only and is not used for the classification.
obs_time <- as.POSIXlt(data2$date)
data2$weekday1 <- weekdays(obs_time)

# POSIXlt stores the day of the week as an integer in `wday`
# (0 = Sunday ... 6 = Saturday), which does not depend on the session locale.
# The test is vectorized, so it covers however many rows data2 has.
is_weekend <- obs_time$wday %in% c(0L, 6L)
data2$weekday <- ifelse(is_weekend, "weekend", "weekday")
# Split the filled-in data by day type using plain logical row indexing.
wddata <- data2[data2$weekday == "weekday", ]
wedata <- data2[data2$weekday == "weekend", ]

# Mean steps per interval within each day type. The label is assigned as a
# single value and recycled to the number of rows aggregate() returned, so no
# interval count has to be hard-coded.
wdinterval <- aggregate(wddata["steps"], by = wddata["interval"], FUN = mean)
wdinterval$weekday <- "weekday"

weinterval <- aggregate(wedata["steps"], by = wedata["interval"], FUN = mean)
weinterval$weekday <- "weekend"

# Stack both summaries so the plot can facet on the weekday column.
stepsinterval2 <- rbind(wdinterval, weinterval)

# One panel per day type, stacked vertically.
plot4 <- ggplot(stepsinterval2, aes(x = interval, y = steps)) +
  geom_line(color = "blue") +
  labs(
    title = "Average Steps Taken by 5-minute Interval",
    x = "5-minute Interval",
    y = "Average Steps Taken"
  ) +
  theme(plot.title = element_text(face = "bold")) +
  facet_grid(weekday ~ .)
print(plot4)