# Personal Movement Data Analysis

First, we load and process the data.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
# Read the raw activity log, then in one step parse the date column and
# derive hour/minute from the interval id (integer quotient and remainder
# by 100 -- presumably interval is encoded as HHMM; confirm against the data).
activity <- read.csv(file = "activity.csv", header = TRUE)
activity <- mutate(
  activity,
  date = as.Date(date),
  hour = interval %/% 100,
  minute = interval %% 100
)

Next, we plot a histogram of the total number of steps taken per day and find the mean and median number of steps taken per day.

# Total steps per date. The formula method of aggregate() omits rows with
# missing steps by default (na.action = na.omit).
totalbyday <- aggregate(steps ~ date, data = activity, FUN = sum)
hist(
  totalbyday$steps,
  breaks = 20,
  col = "pink",
  main = "Total Steps Each Day",
  xlab = "Number of Steps",
  xlim = c(0, 25000)
)

# Centre of the daily-total distribution (before imputation).
mean(totalbyday[["steps"]])
## [1] 10766.19
median(totalbyday[["steps"]])
## [1] 10765

We then create a time series plot of the average number of steps taken in each 5-minute interval, averaged across all days, and find the interval with the highest average number of steps.

# Mean steps for each interval id across all dates (despite the name,
# avgbyday holds one row per interval, not per day).
avgbyday <- aggregate(steps ~ interval, data = activity, FUN = mean)
plot(
  x = avgbyday$interval,
  y = avgbyday$steps,
  type = "l",
  main = "Average Steps at Each Time Interval",
  xlab = "Time Interval",
  ylab = "Average Steps",
  xlim = c(0, 2500)
)

# Interval id whose mean step count is the largest.
with(avgbyday, interval[which.max(steps)])
## [1] 835

Next, we impute the missing values by replacing each one with the mean for its 5-minute interval (averaged across all days). As before, we plot a histogram of the total steps taken per day, now including the imputed values.

# Count the missing step values before imputing.
sum(is.na(activity$steps))
## [1] 2304
activity2 <- activity
# Replace every missing value with the mean for that row's own interval.
# The lookup is keyed on the interval id via match(), so it stays correct
# even if missing values do not come in complete, ordered days; a positional
# fill would depend on vector recycling lining up with the interval order.
na_rows <- is.na(activity2$steps)
activity2$steps[na_rows] <-
  avgbyday$steps[match(activity2$interval[na_rows], avgbyday$interval)]
# Total steps per date, computed from the imputed copy of the data.
totalbyday2 <- aggregate(steps ~ date, data = activity2, FUN = sum)
hist(
  totalbyday2$steps,
  breaks = 20,
  col = "pink",
  main = "Total Steps Each Day (Incl. Imputed Values)",
  xlab = "Number of Steps",
  xlim = c(0, 25000)
)

# Centre of the daily-total distribution after imputation.
mean(totalbyday2[["steps"]])
## [1] 10766.19
median(totalbyday2[["steps"]])
## [1] 10766.19

As shown, after imputation the mean and median are equal (both 10766.19). Compared with the non-imputed data, the mean is unchanged, while the median shifts slightly from 10765 to 10766.19.

Finally, we investigate any differences in activity patterns between weekdays and weekends.

# Label each row as weekend or weekday using the numeric day of the week
# (POSIXlt wday: 0 = Sunday, 6 = Saturday). Unlike weekdays(), which returns
# day names in the current locale, this does not depend on an English locale.
activity2$day <- ifelse(
  as.POSIXlt(activity2$date)$wday %in% c(0, 6),
  "weekend",
  "weekday"
)
# Mean steps per interval, computed separately for weekdays and weekends.
avgbyday2 <- aggregate(steps ~ interval + day, data = activity2, FUN = mean)
# One panel per day type, stacked vertically by facet_grid(). No par(mfrow)
# call is needed: it only affects base graphics, not ggplot2 output.
ggplot(data = avgbyday2, aes(x = interval, y = steps, color = day)) +
  facet_grid(day ~ .) +
  geom_line(color = "purple", size = 1)