Here we load all the libraries that need to be loaded and set any global options.
library(ggplot2)
library(scales)
Here we unzip the file “activity.zip” and extract the data. Then we load that data into the data frame activity.data so we can use it.
# Extract the CSV from the archive only when it is not already on disk.
if (!file.exists("activity.csv")) {
  unzip("activity.zip")
}

# Load the raw measurements; the three columns are typed positionally as
# numeric, Date and numeric.
activity.data <- read.csv(
  "activity.csv",
  colClasses = c("numeric", "Date", "numeric")
)
Here we calculate the mean total number of steps taken per day across the entire time period. We first sum the steps taken on each day and then take the mean of those daily totals.
# Total steps per calendar day. Because na.rm = TRUE is forwarded to sum(),
# a day whose readings are all NA contributes a total of 0 rather than NA.
steps.per.day <- with(
  activity.data,
  aggregate(steps, by = list(date), FUN = sum, na.rm = TRUE)
)
names(steps.per.day) <- c("date", "steps")

# Histogram of the daily totals. colour, fill and alpha are fixed values, so
# they are set on the geom rather than mapped inside aes(): mapping the
# string "red" would create a one-level discrete scale drawn with the default
# palette (and a legend) instead of literal red.
histplot <- ggplot(steps.per.day, aes(x = steps)) +
  geom_histogram(
    binwidth = 2000,
    colour = "red",
    fill = "red",
    alpha = 0.2
  ) +
  ggtitle("Histogram of daily steps") +
  xlab("Steps (binwidth 2000)") +
  ylab("Count")
histplot
We calculate the mean of total steps taken over the time period, ignoring the missing values.
# Mean of the daily step totals. TRUE is spelled out because T is an ordinary
# variable that can be reassigned, whereas TRUE is a reserved word.
mean.steps <- mean(steps.per.day$steps, na.rm = TRUE)
mean.steps
## [1] 9354.23
We also calculate the median of steps taken.
# Median of the daily step totals. TRUE is spelled out because T is an
# ordinary variable that can be reassigned, whereas TRUE is a reserved word.
median.steps <- median(steps.per.day$steps, na.rm = TRUE)
median.steps
## [1] 10395
As we can see, the mean of steps taken during any day is 9354.23 and the median is 10395.
We calculate the average daily activity pattern by averaging the number of steps in each 5-minute interval across all days, which shows what a typical day looks like.
# Mean number of steps for each interval, averaged over all days; missing
# readings are dropped before averaging.
average.steps.per.interval <- with(
  activity.data,
  aggregate(steps, by = list(interval), FUN = mean, na.rm = TRUE)
)
names(average.steps.per.interval) <- c("interval", "steps")

# Line plot of the average profile. interval holds numeric values, so the x
# axis needs a continuous scale; discrete scales are intended for categorical
# data and do not place pretty_breaks() ticks on numeric values.
avg.steps <- ggplot(average.steps.per.interval, aes(interval, steps)) +
  geom_line(colour = "blue") +
  xlab("5-Minute Interval") +
  ylab("Steps Taken") +
  ggtitle("Time Series Plot of Average Steps by Interval") +
  scale_x_continuous(breaks = pretty_breaks(10)) +
  scale_y_continuous(breaks = pretty_breaks(10))
avg.steps
We also calculate the 5-minute interval with the highest value.
# Which 5-minute interval has the highest average number of steps?
# which.max() gives the row position of the largest mean; the matching
# interval identifier is then picked out of the same data frame.
max.interval <- with(
  average.steps.per.interval,
  interval[which.max(steps)]
)
max.interval
## [1] 835
The 5-minute interval with the highest number of steps, averaged across all days, is 835.
# Count the rows whose step reading is missing.
na.total <- sum(is.na(activity.data$steps))
The total number of missing values in our dataset is 2304.
We will fill them with average values for each interval using the average.steps.per.interval dataset we computed above.
# Impute each missing step count with the mean for its interval.
# The left join attaches the per-interval mean to every observation; the two
# clashing "steps" columns come back as steps.x (observed) and steps.y (mean).
# Note that merge() sorts the result by the join key, so rows are ordered by
# interval rather than by date afterwards.
activity.imputed <- merge(
  x = activity.data,
  y = average.steps.per.interval,
  by = "interval",
  all.x = TRUE
)

# Keep the observed value where present, otherwise fall back to the interval
# mean, then drop the two helper columns.
activity.imputed$steps <- with(
  activity.imputed,
  ifelse(is.na(steps.x), steps.y, steps.x)
)
activity.imputed$steps.x <- NULL
activity.imputed$steps.y <- NULL

# Daily totals recomputed from the imputed data.
total.steps.per.day <- aggregate(
  x = activity.imputed$steps,
  by = list(activity.imputed$date),
  FUN = sum,
  na.rm = TRUE
)
names(total.steps.per.day) <- c("date", "steps")

# Histogram of the imputed daily totals. colour, fill and alpha are constants,
# so they are set on the geom; mapping them inside aes() would draw with the
# default discrete palette instead of literal red.
histplot.impute <- ggplot(total.steps.per.day, aes(x = steps)) +
  geom_histogram(
    binwidth = 2000,
    colour = "red",
    fill = "red",
    alpha = 0.2
  ) +
  ggtitle("Histogram of daily steps after imputation") +
  xlab("Steps (binwidth 2000)") +
  ylab("Count")
histplot.impute
Once again, we calculate the mean of total steps taken per day over the time period, this time using the data set in which the missing values have been imputed.
# Mean of the daily step totals after imputation. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
mean.steps.impute <- mean(total.steps.per.day$steps, na.rm = TRUE)
mean.steps.impute
## [1] 10766.19
We also calculate the median of steps taken.
# Median of the daily step totals after imputation. TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
median.steps.impute <- median(total.steps.per.day$steps, na.rm = TRUE)
median.steps.impute
## [1] 10766.19
As we can see, after imputation the mean of total steps taken per day is 10766.19 and the median is also 10766.19.
We will now add a separate factor, called weekday, to activity.imputed to see if activity patterns differ between weekdays and weekends.
# Label every observation as "Weekday" or "Weekend".
# The day of the week is taken from POSIXlt's numeric wday field (0 = Sunday,
# 6 = Saturday) instead of weekdays(), whose day names depend on the session
# locale and would never match "Saturday"/"Sunday" in a non-English locale.
activity.imputed$weekday <- as.factor(ifelse(
  as.POSIXlt(activity.imputed$date)$wday %in% c(0, 6),
  "Weekend",
  "Weekday"
))

# Mean steps per interval, computed separately for weekdays and weekends.
average.steps.per.interval.by.weekday <- aggregate(
  x = activity.imputed$steps,
  by = list(activity.imputed$interval, activity.imputed$weekday),
  FUN = mean,
  na.rm = TRUE
)
names(average.steps.per.interval.by.weekday) <- c(
  "interval", "weekday", "steps"
)

# One panel per day type. Both interval and steps are numeric, so both axes
# use continuous scales; discrete scales are intended for categorical data.
avg.step.line <- ggplot(
  average.steps.per.interval.by.weekday,
  aes(interval, steps)
) +
  xlab("5-Minute Interval") +
  ylab("Steps Taken") +
  ggtitle("Time Series Plot of Average Steps by Interval after Imputation, Factoring Weekends") +
  facet_grid(. ~ weekday) +
  geom_line(colour = "blue") +
  scale_x_continuous(breaks = pretty_breaks(6)) +
  scale_y_continuous(breaks = pretty_breaks(10))
avg.step.line