This project makes use of data from a personal activity monitoring device. The device collects data at 5-minute intervals throughout the day. The data consist of two months of measurements from an anonymous individual, collected during October and November 2012, and include the number of steps taken in each 5-minute interval each day.
Basic analysis of the data is completed in R and detailed below.
library(ggplot2)
library(plyr)
# Locate the raw activity log. The original machine-specific path is tried
# first; if it does not exist, fall back to activity.csv in the working
# directory so the analysis can be reproduced on another machine.
data_path <- "C:/Users/Adam/Documents/School/Coursera/Reproducable research/activity.csv"
if (!file.exists(data_path)) {
  data_path <- "activity.csv"
}
measured_data <- read.csv(data_path)

# Convert the date column to Date class using as.Date()'s default formats.
measured_data$date <- as.Date(measured_data$date)

# Despite its name, na_index is TRUE for rows whose step count is present.
na_index <- !is.na(measured_data$steps)

# Working copy that contains only rows with a recorded step count.
subset_data <- measured_data[na_index, ]
Calculating the total number of steps taken per day
# Sum the recorded steps within each calendar day. aggregate() names the
# summed column "x", which is how later code refers to the daily total.
summarystat <- aggregate(
  x = subset_data$steps,
  by = list(date = subset_data$date),
  FUN = sum
)
A histogram of the total number of steps taken per day
# Histogram of the daily totals: each bar counts the days whose total number
# of steps falls inside a 1000-step bin. Plotting total against date with
# identity bars would give a bar chart of the series, not a histogram, and
# relies on the `stat` argument of qplot(), which current ggplot2 ignores.
ggplot(summarystat, aes(x = x)) +
  geom_histogram(binwidth = 1000) +
  xlab("total steps per day") +
  ylab("count")
The mean and median number of steps taken per day
# Mean of the daily step totals
mean(summarystat[["x"]])
## [1] 10766.19
# Median of the daily step totals
median(summarystat[["x"]])
## [1] 10765
A time series plot of the average number of steps taken in each 5-minute interval, averaged across all days
# Average the recorded steps for each 5-minute interval across all days.
# aggregate() names the averaged column "x".
mean_steps_inter <- aggregate(
  x = subset_data$steps,
  by = list(interval = subset_data$interval),
  FUN = mean
)

# Line plot of the interval averages. ggplot() is used instead of the
# deprecated qplot(), and the y axis is labelled as an average rather than a
# count because the plotted values are means.
ggplot(mean_steps_inter, aes(x = interval, y = x)) +
  geom_line() +
  xlab("interval") +
  ylab("average number of steps")
The interval with the highest average number of steps, and that average, are as follows:
# Show every row whose interval average equals the overall maximum
# (all rows are returned if several intervals tie).
subset(mean_steps_inter, x == max(x))
## interval x
## 104 835 206.1698
Calculating and reporting the total number of missing values in the dataset
# Number of rows in the raw data whose step count is NA
length(which(is.na(measured_data$steps)))
## [1] 2304
Filling in missing data: 8 days of data are missing. Each missing value is replaced with the mean number of steps for its 5-minute interval, averaged across all days with data.
# Replace every missing step count with the across-day mean for the same
# 5-minute interval. This is vectorized: match() looks up each missing row's
# interval in mean_steps_inter once, instead of looping over every row and
# searching the lookup table per element. It is also safe on empty input,
# unlike a 1:length(...) loop.
missing_steps <- is.na(measured_data$steps)
measured_data$steps[missing_steps] <- mean_steps_inter$x[
  match(measured_data$interval[missing_steps], mean_steps_inter$interval)
]
Making a histogram with the imputed data and calculating the mean and median
# Recompute the total steps per day, now including the imputed values.
# aggregate() names the summed column "x".
summarystat <- aggregate(
  x = measured_data$steps,
  by = list(date = measured_data$date),
  FUN = sum
)

# Histogram of the daily totals: each bar counts the days whose total falls
# inside a 1000-step bin. Plotting total against date with identity bars
# would give a bar chart of the series, not a histogram, and relies on the
# `stat` argument of qplot(), which current ggplot2 ignores.
ggplot(summarystat, aes(x = x)) +
  geom_histogram(binwidth = 1000) +
  xlab("total steps per day") +
  ylab("count")

# Mean of the daily totals after imputation
mean(summarystat$x)
## [1] 10766.19
# Median of the daily totals after imputation
median(summarystat$x)
## [1] 10766.19
Creating a new factor variable with the levels "Weekday" and "Weekend"
# Classify each date as "Weekday" or "Weekend".
# as.POSIXlt()$wday gives the day of the week as a number (0 = Sunday,
# 6 = Saturday), so the result does not depend on the session locale the way
# comparing weekdays() against English day names does. Dates that cannot be
# classified (NA) fall into "Weekend", as in the loop-based version.
day_number <- as.POSIXlt(measured_data$date)$wday
weekday_vec <- factor(
  ifelse(day_number %in% 1:5, "Weekday", "Weekend"),
  levels = c("Weekday", "Weekend")
)

# Attach the factor as a column; assigning by name keeps the data frame
# unchanged in shape if this section is run more than once.
measured_data$weekday_vec <- weekday_vec

# Mean steps for every (day type, interval) pair. ddply() stores the value
# returned by the function in a column named V1.
summary <- ddply(
  measured_data,
  .(weekday_vec, interval),
  function(group_rows) mean(group_rows$steps, na.rm = TRUE)
)
Creating a panel time series plot for weekend and weekday
# Base plot: interval on the x axis, mean steps (column V1) on the y axis.
q <- ggplot(data = summary, mapping = aes(x = interval, y = V1))
# Draw the series as lines, one side-by-side panel per weekday_vec level.
q +
  geom_line() +
  facet_grid(. ~ weekday_vec)