set the default values for echo
# Make knitr show each chunk's source code in the rendered report by default.
knitr::opts_chunk$set(echo = TRUE)
First we download the data
# Fetch the zipped activity data and extract it into the working directory.
# The archive is saved with a .zip name and unpacked so that the CSV read in
# the next step actually exists; the work is skipped when the CSV is already
# present.
# NOTE(review): assumes the archive contains activity.csv at its top level.
url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
zip_path <- "activity.zip"
if (!file.exists("activity.csv")) {
  # mode = "wb" keeps the binary archive intact on Windows.
  download.file(url, destfile = zip_path, mode = "wb")
  unzip(zip_path)
}
Read the data into R
# Load the raw activity measurements from the CSV in the working directory.
activity <- read.csv(file = "./activity.csv")
What is the mean total number of steps taken per day?
First, remove the rows with missing values, then total the steps for each day.
# Keep only fully observed rows, then sum the steps recorded on each date.
# The formula interface names the result columns "date" and "steps" directly.
activity_complete <- activity[complete.cases(activity), ]
total_steps <- aggregate(steps ~ date, data = activity_complete, FUN = sum)
head(total_steps)
## date steps
## 1 2012-10-02 126
## 2 2012-10-03 11352
## 3 2012-10-04 12116
## 4 2012-10-05 13294
## 5 2012-10-06 15420
## 6 2012-10-07 11015
library(ggplot2)
# Bar chart of each day's step total. Dates are treated as discrete categories
# and their axis labels are rotated so they remain legible.
ggplot(total_steps, aes(as.factor(date), steps)) +
  geom_col(fill = "blue") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "date", y = "total steps", title = "total steps per day")
# Mean and median of the total steps per day. These must be computed on the
# per-day totals; summarising activity_complete$steps directly would describe
# individual 5-minute intervals rather than days.
# NOTE(review): any figures quoted in the rendered text need re-knitting to
# reflect these per-day values.
mean_steps <- mean(total_steps$steps)
median_steps <- median(total_steps$steps)
The mean and median total number of steps taken per day are 37.3825996 and 0 respectively.
What is the average daily activity pattern?
First, we find the average number of steps for each 5-minute interval, averaged across all days.
# Average the steps within each 5-minute interval. This overwrites
# activity_complete with the per-interval summary (columns: interval, steps).
activity_complete <- aggregate(
  steps ~ interval,
  data = activity_complete,
  FUN = mean
)
make the plot
# Line plot of the mean steps against the 5-minute interval identifier.
ggplot(activity_complete, aes(x = interval, y = steps)) +
  geom_line(colour = "red") +
  labs(
    x = "interval",
    y = "steps",
    title = "time series plot of Average number of \nsteps against interval "
  )
# Interval whose average step count is highest. which.max() returns the first
# maximal row, avoiding an equality test on floating-point means and the
# reliance on "interval" being the first column.
most_steps <- activity_complete$interval[which.max(activity_complete$steps)]
The 5-minute interval with the highest number of steps is 835.
Imputing missing values
# Count the rows of the raw data that contain at least one missing value.
total <- sum(!complete.cases(activity))
There are 2304 rows with NA values
Devise a strategy for filling in all of the missing values in the dataset. The strategy does not need to be sophisticated. For example, you could use the mean/median for that day, or the mean for that 5-minute interval, etc.
Create a new dataset that is equal to the original dataset but with the missing data filled in.
# Copy the raw data and fill every missing step count with the overall median
# of the observed step counts. The column is addressed by name so the
# imputation does not depend on "steps" being the first column.
nas_replaced <- activity
missing_steps <- is.na(nas_replaced$steps)
nas_replaced$steps[missing_steps] <- median(nas_replaced$steps, na.rm = TRUE)
The new dataset is called nas_replaced
First, we find the total number of steps per day in the imputed dataset.
# Sum the imputed step counts within each date and label the two result
# columns in a single expression.
new_total_steps <- setNames(
  aggregate(nas_replaced$steps, by = list(nas_replaced$date), FUN = sum),
  c("date", "steps")
)
Now plot the total number of steps for each day as a bar chart.
# Bar chart of each day's step total after imputation. Dates are treated as
# discrete categories and their axis labels are rotated for legibility.
ggplot(new_total_steps, aes(as.factor(date), steps)) +
  geom_col(fill = "blue") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "date", y = "total steps", title = "total steps per day")
Calculate the mean and median number of steps taken per day after the NA values have been replaced by the median.
# Mean and median of the total steps per day after imputation. These are
# computed on the per-day totals; summarising nas_replaced$steps directly
# would describe individual 5-minute intervals rather than days.
# NOTE(review): any figures quoted in the rendered text need re-knitting to
# reflect these per-day values.
new_mean <- mean(new_total_steps$steps)
new_median <- median(new_total_steps$steps)
The mean is 37.3825996 with NAs removed and 32.4799636 when NAs are replaced with the median. The median is 0 in both cases.
The mean is therefore greater by 4.902636 when NAs are removed. The median, however, remains the same at 0.
Are there differences in activity patterns between weekdays and weekends?
Let's convert the date variable to the Date format.
# Parse the date column into Date objects so weekday checks can be applied.
nas_replaced[["date"]] <- as.Date(nas_replaced[["date"]])
Using the chron package, we check whether each date falls on a weekend and add a new variable to the data frame recording the result.
library(chron)
# Label each row "weekend" or "weekday". ifelse() maps the logical vector from
# is.weekend() in one vectorised step, replacing a row-by-row loop that only
# worked because the logical column was silently coerced to character after
# the first assignment.
nas_replaced$day <- ifelse(
  chron::is.weekend(nas_replaced$date),
  "weekend",
  "weekday"
)
First, we find the average number of steps for each interval, separately for weekdays and weekends.
# Average the steps for every interval/day-type combination. This overwrites
# nas_replaced with the summary (columns: interval, day, steps).
nas_replaced <- aggregate(
  steps ~ interval + day,
  data = nas_replaced,
  FUN = mean
)
make the plot
# One panel per day type, stacked vertically, each showing the mean steps
# across the 5-minute intervals.
ggplot(nas_replaced, aes(x = interval, y = steps, color = day)) +
  geom_line() +
  labs(
    x = "interval",
    y = "steps",
    title = "panel plot of Average number of \nsteps against interval for \nweekday and weekend"
  ) +
  facet_wrap(~ day, ncol = 1)
Based on the plot above, the pattern of activity remains generally unchanged during the weekdays and weekend.