# Personal Movement Data Analysis
First, we load and process the data.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
# Read the raw activity log; the first row of the file supplies column names.
activity <- read.csv(file = "activity.csv", header = TRUE)
# Convert `date` to Date class and derive two helper columns from `interval`:
# `hour` is the integer quotient by 100, `minute` is the remainder modulo 100.
activity <- activity %>%
  mutate(
    date = as.Date(date),
    hour = interval %/% 100,
    minute = interval %% 100
  )
Next, we plot a histogram of the total number of steps taken per day and find the mean and median number of steps taken per day.
# Sum the steps recorded on each date. The formula interface of aggregate()
# omits rows with missing values by default.
totalbyday <- aggregate(steps ~ date, data = activity, FUN = sum)
# Distribution of the daily totals.
hist(
  x = totalbyday[["steps"]],
  breaks = 20,
  xlim = c(0, 25000),
  col = "pink",
  main = "Total Steps Each Day",
  xlab = "Number of Steps"
)
# Mean of the daily totals.
with(totalbyday, mean(steps))
## [1] 10766.19
# Median of the daily totals.
with(totalbyday, median(steps))
## [1] 10765
We then create a time series plot of the average number of steps taken in each 5-minute interval, averaged across all days, and find the interval with the highest average number of steps.
# Mean number of steps for each interval value, taken over all dates
# (despite its name, `avgbyday` holds one row per interval, not per day).
avgbyday <- aggregate(steps ~ interval, data = activity, FUN = mean)
# Line plot of the per-interval means.
plot(
  x = avgbyday[["interval"]],
  y = avgbyday[["steps"]],
  type = "l",
  xlim = c(0, 2500),
  main = "Average Steps at Each Time Interval",
  xlab = "Time Interval",
  ylab = "Average Steps"
)
# Interval whose mean step count is the largest.
with(avgbyday, interval[which.max(steps)])
## [1] 835
Next, we count the missing values and impute them by replacing each one with the mean number of steps for its 5-minute interval. As before, we plot a histogram of the total steps taken per day, now including the imputed values, and recompute the mean and median.
# Number of missing step observations before imputing.
sum(is.na(activity$steps))
## [1] 2304
# Work on a copy so the original data stays untouched.
activity2 <- activity
# Replace every missing value with the mean of its own interval. The mean is
# looked up by interval value with match() rather than by assigning
# avgbyday$steps straight into the NA positions: that direct assignment only
# lines up when the missing rows form complete, interval-ordered days, and
# otherwise silently recycles means onto the wrong intervals.
is_missing <- is.na(activity2$steps)
activity2$steps[is_missing] <-
  avgbyday$steps[match(activity2$interval[is_missing], avgbyday$interval)]
# Daily totals recomputed on the imputed data.
totalbyday2 <- aggregate(steps ~ date, data = activity2, FUN = sum)
hist(
  totalbyday2$steps,
  breaks = 20,
  main = "Total Steps Each Day (Incl. Imputed Values)",
  col = "pink",
  xlab = "Number of Steps",
  xlim = c(0, 25000)
)
mean(totalbyday2$steps)
## [1] 10766.19
median(totalbyday2$steps)
## [1] 10766.19
As shown, after imputing, the mean and median of the daily totals are equal to each other (both 10766.19). Compared with the non-imputed data, the mean is unchanged (10766.19), while the median rises slightly from 10765 to 10766.19.
Finally, we investigate any differences in activity patterns between weekdays and weekends.
# Label each row as weekend or weekday. The numeric day-of-week from POSIXlt
# (0 = Sunday, 6 = Saturday) is used instead of weekdays(), whose names depend
# on the session locale and would not match "Saturday"/"Sunday" in a
# non-English locale.
activity2$day <- ifelse(
  as.POSIXlt(activity2$date)$wday %in% c(0, 6),
  "weekend",
  "weekday"
)
# Mean steps per interval, computed separately for weekdays and weekends.
avgbyday2 <- aggregate(steps ~ interval + day, data = activity2, FUN = mean)
# One stacked panel per day type. facet_grid() handles the panel layout;
# base-graphics par(mfrow = ...) has no effect on ggplot2 output, so it is not
# set here. The line colour is a fixed constant, so `day` is not mapped to
# colour.
# NOTE(review): `size` is kept for compatibility with older ggplot2 releases;
# from ggplot2 3.4.0 onward `linewidth` is the preferred argument for lines.
ggplot(data = avgbyday2, aes(x = interval, y = steps)) +
  facet_grid(day ~ .) +
  geom_line(color = "purple", size = 1)