Loading and processing the data
# Load the necessary packages
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
# Load the raw activity log, then keep only rows with a recorded step count
data <- read.csv(file = "activity.csv")
data_clean <- data[complete.cases(data$steps), ]
# Peek at the first five usable observations
head(data_clean, n = 5)
steps date interval
289 0 02/10/2012 0 290 0 02/10/2012 5 291 0 02/10/2012 10 292 0 02/10/2012 15 293 0 02/10/2012 20
What is the mean total number of steps taken per day?
For this part of the assignment, you can ignore the missing values in the dataset
# Bucket the cleaned observations by calendar day (`daily` is reused later)
daily <- data_clean %>%
  group_by(date)
# One row per day holding that day's total step count
total_steps_by_day <- daily %>%
  summarize(total = sum(steps))
# Distribution of the daily totals
hist(
  x = total_steps_by_day$total,
  xlab = "Total number of steps in a day",
  main = "Histogram of total number of steps per day"
)
# Five-number summary (plus mean) of the daily totals
total_steps_by_day %>%
  summary()
date total
02/10/2012: 1 Min. : 41
02/11/2012: 1 1st Qu.: 8841
03/10/2012: 1 Median :10765
03/11/2012: 1 Mean :10766
04/10/2012: 1 3rd Qu.:13294
05/10/2012: 1 Max. :21194
(Other) :47
# Mean and median of the daily totals, reported as a one-row table
total_steps_by_day %>%
  summarize(mean = mean(total), median = median(total))
mean median
<dbl> <int>
1 10766.19 10765
Answer: The mean is 10766 and the median is 10765.
What is the average daily activity pattern?
# Average the step count of each 5-minute interval across all days
steps_in_interval <- aggregate(steps ~ interval, data = data_clean, FUN = mean)
# Draw a time series plot.
# plot() is called without wrapping parentheses so that its invisible NULL
# return value is not auto-printed into the report.
with(
  steps_in_interval,
  plot(
    interval, steps,
    type = "l",
    main = "Average number of steps over all days",
    xlab = "Interval",
    ylab = "Average number of steps"
  )
)
NULL
# Position of the interval with the highest average step count
max_row <- with(steps_in_interval, which.max(steps))
# Show that interval together with its average
steps_in_interval[max_row, ]
interval steps
104 835 206.1698
Answer: Interval 835 has the maximum average value of steps
Imputing missing values
There are a number of days/intervals in the dataset with missing values (coded as NA). The presence of missing days may introduce bias into some calculations or summaries of the data.
NA Strategy: The missing data strategy that I will use is to substitute average daily steps for the missing data.
# Mean steps per interval for every day that has at least one observation
mean_steps <- summarize(daily, mean = mean(steps))
# Initialise the filled data from the original data
data_filled <- data
# Look up each row's daily mean by the *value* of its date.
# Indexing mean_steps$mean with the date factor itself would use the factor's
# integer code, which does not line up with the rows of mean_steps (days
# without observations are absent from it), so rows would receive another
# day's mean or NA.
daily_mean <- mean_steps$mean[
  match(as.character(data_filled$date), as.character(mean_steps$date))
]
# Days with no observations at all have no daily mean of their own; fall back
# to the overall mean so that no NA is left behind.
daily_mean[is.na(daily_mean)] <- mean(data$steps, na.rm = TRUE)
# Fill NAs in with the mean data (vectorised; observed values are untouched)
missing_steps <- is.na(data_filled$steps)
data_filled$steps[missing_steps] <- daily_mean[missing_steps]
# Repeat the per-day aggregation, this time on the imputed data
daily_filled <- data_filled %>%
  group_by(date)
# Note: this overwrites the earlier total_steps_by_day table
total_steps_by_day <- daily_filled %>%
  summarize(total = sum(steps))
# Histogram of the daily totals after imputation
hist(
  x = total_steps_by_day$total,
  xlab = "Total number of steps in a day (filled data)",
  main = "Histogram of total number of steps per day"
)
# Summary statistics to compare against the unfilled data
total_steps_by_day %>%
  summary()
date total
01/10/2012: 1 Min. : 41
01/11/2012: 1 1st Qu.: 8836
02/10/2012: 1 Median :10600
02/11/2012: 1 Mean :10587
03/10/2012: 1 3rd Qu.:12986
03/11/2012: 1 Max. :21194
(Other) :55 NA’s :1
Answer: The new mean is smaller than the old one (10587 versus 10766) and the new median is slightly smaller than the old one (10600 versus 10765).
Are there differences in activity patterns between weekdays and weekends?
For this part the weekdays() function may be of some help. Use the dataset with the filled-in missing values for this part.
Create a new factor variable in the dataset with two levels - “weekday” and “weekend” indicating whether a given date is a weekday or a weekend day.
Make a panel plot containing a time series plot (i.e. type=“l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekdays or weekend days (y-axis).
# Create the new factor using the filled data.
# The date column is text in day/month/year form (e.g. "02/10/2012"), so the
# format has to be spelled out: as.Date() without a format tries year-first
# layouts and yields wrong dates, and therefore wrong weekdays.
obs_date <- as.Date(as.character(data_filled$date), format = "%d/%m/%Y")
# ISO weekday number (1 = Monday ... 7 = Sunday). Unlike weekdays(), this does
# not depend on the session locale's day names.
iso_weekday <- as.integer(format(obs_date, "%u"))
# Two-level factor; a date that failed to parse stays NA rather than being
# silently labelled.
data_filled$day_type <- factor(
  ifelse(iso_weekday >= 6L, "weekend", "weekday"),
  levels = c("weekday", "weekend")
)
head(data_filled, 5)
steps date interval day_type 1 0.4375 01/10/2012 0 weekend 2 0.4375 01/10/2012 5 weekend 3 0.4375 01/10/2012 10 weekend 4 0.4375 01/10/2012 15 weekend 5 0.4375 01/10/2012 20 weekend
# Average steps per 5-minute interval, separately for each day type
filled_steps_in_interval <- aggregate(
  steps ~ interval + day_type,
  data = data_filled,
  FUN = mean
)
# Stacked panels (one per day type) comparing the daily activity profile
ggplot(data = filled_steps_in_interval, mapping = aes(x = interval, y = steps)) +
  geom_line() +
  facet_grid(day_type ~ .) +
  labs(x = "5-minute interval", y = "Number of steps")