## Point the R session at the assignment folder so the relative file name below resolves
## NOTE(review): this path is specific to one machine -- confirm or adjust before running elsewhere
setwd("~/R/Coursera/Data Science/Course 5/Assignment 1")
## Load the raw activity measurements from the CSV file
activity <- read.csv(file = "activity.csv")
## Show the structure of the freshly loaded data (recorded output follows)
str(object = activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
## Turn the date column into proper Date objects so it can be used for calendar logic later
activity[["date"]] <- as.Date(activity[["date"]])
## Show the structure again to confirm the conversion (recorded output follows)
str(object = activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Date, format: "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
To determine the mean total number of steps taken per day, the total number of steps for each day has to be calculated first. The results can then be plotted and summarized.
## Sum the steps recorded on each date
## (the formula interface of aggregate() omits rows with NA steps by default)
StepsPerDay <- aggregate(steps ~ date, data = activity, FUN = sum)
## Histogram of the daily step totals
hist(
  StepsPerDay[["steps"]],
  main = "Total number of daily steps",
  xlab = "Total steps"
)
## Mean and median of the daily step totals
StepsPerDayMean <- mean(StepsPerDay[["steps"]], na.rm = TRUE)
StepsPerDayMedian <- median(StepsPerDay[["steps"]], na.rm = TRUE)
The mean number of steps taken per day is $1.0766189 \times 10^{4}$ (approximately 10766.19).
The median number of steps taken per day is 10765.
To determine the average daily activity pattern, first the mean number of steps per interval has to be calculated. Then the results can be plotted.
## Average the steps within each 5-minute interval across all days
StepsPerInterval <- aggregate(steps ~ interval, data = activity, FUN = mean)
## Relabel the averaged column so it stays distinct from `steps` when merged later
names(StepsPerInterval) <- replace(
  names(StepsPerInterval),
  names(StepsPerInterval) == "steps",
  "AverageSteps"
)
## Line plot of the average steps against the 5-minute interval
plot(
  StepsPerInterval,
  type = "l",
  main = "Average number of steps per interval",
  xlab = "Interval [5-min increments]",
  ylab = "Average number of steps"
)
## Identify the 5-minute interval whose average step count is highest
MaxSteps <- with(StepsPerInterval, interval[which.max(AverageSteps)])
The five-minute interval containing the maximum number of steps is 835.
Note that there are a number of days/intervals where there are missing values (coded as NA). The presence of missing days may introduce bias into some calculations or summaries of the data.
## Count the missing values column by column, then add the counts together
MissVals <- sum(vapply(activity, function(column) sum(is.na(column)), integer(1)))
The total number of missing values in the dataset is 2304.
The strategy is not sophisticated: each missing value is filled in with the mean number of steps for its 5-minute interval. Those interval means were already calculated above, in the “average daily activity pattern” section.
## Attach each interval's average step count to every observation of that interval
ImputedActivity <- merge(activity, StepsPerInterval, by = "interval")
## Wherever the step count is missing, substitute the average for that interval
ImputedActivity$steps <- with(
  ImputedActivity,
  ifelse(is.na(steps), AverageSteps, steps)
)
Again, the total number of steps taken each day (imputed) needs to be calculated before it can be plotted.
## Total the imputed steps for each date.
## Grouping must be by date (not by interval) so that each row is one day's total,
## which is what the histogram and the daily mean/median below describe.
AggregatedImputedActivity <- aggregate(steps ~ date, data = ImputedActivity, FUN = sum)
## Histogram of the total number of steps taken each day (with imputed data)
hist(
  AggregatedImputedActivity$steps,
  xlab = "Total steps",
  main = "Total number of daily steps (Imputed)"
)
## Calculate the mean and median total number of steps taken per day (with imputed data)
ImputedStepsPerDayMean <- mean(AggregatedImputedActivity$steps, na.rm = TRUE)
ImputedStepsPerDayMedian <- median(AggregatedImputedActivity$steps, na.rm = TRUE)
The mean number of steps taken per day (imputed) is 2280.3385744.
The median number of steps taken per day (imputed) is 2080.9056604.
## Change in the daily mean and median caused by imputation (imputed minus original);
## diff() on a two-element vector returns the second element minus the first
DiffMean <- diff(c(StepsPerDayMean, ImputedStepsPerDayMean))
DiffMedian <- diff(c(StepsPerDayMedian, ImputedStepsPerDayMedian))
Filling in missing values with any substitute data will affect the estimates, regardless of which data are used for the imputation.
In this case the mean number of daily steps changed by -8485.8501048 steps, while the median number of daily steps changed by -8684.0943396 steps.
For this part the weekdays() function was used. I used the dataset with the filled-in (imputed) missing values for this part.
## Classify dates as "weekend" (Saturday or Sunday) or "weekday".
##
## Argument:
##   date - a Date vector (or any value as.POSIXlt() can convert); any length.
## Returns:
##   A character vector, the same length as `date`, holding "weekend" or "weekday".
##   NA dates are labelled "weekday", as the earlier weekdays()-based test did.
DayCategory <- function(date) {
  ## POSIXlt stores the day of the week as a number (0 = Sunday ... 6 = Saturday),
  ## so this test does not depend on the session locale the way the day names
  ## returned by weekdays() do.
  is_weekend <- as.POSIXlt(date)$wday %in% c(0L, 6L)
  ## Index a two-element lookup: FALSE + 1 -> "weekday", TRUE + 1 -> "weekend"
  c("weekday", "weekend")[is_weekend + 1L]
}
## Label every observation as weekday/weekend and store the labels as a factor.
## vapply() is used instead of sapply() so the result is guaranteed to be one
## character string per date, whatever the input looks like.
ImputedActivity$DayCat <- as.factor(
  vapply(ImputedActivity$date, DayCategory, character(1), USE.NAMES = FALSE)
)
##Describe dataset
str(ImputedActivity)
## 'data.frame': 17568 obs. of 5 variables:
## $ interval : int 0 0 0 0 0 0 0 0 0 0 ...
## $ steps : num 1.72 0 0 0 0 ...
## $ date : Date, format: "2012-10-01" "2012-11-23" ...
## $ AverageSteps: num 1.72 1.72 1.72 1.72 1.72 ...
## $ DayCat : Factor w/ 2 levels "weekday","weekend": 1 1 2 1 2 1 2 1 1 2 ...
I used the lattice package to replicate the example time-series panel plot in the assignment materials. To determine the average daily activity pattern on the imputed data, the mean number of steps per interval has to be calculated first, separately for weekdays and weekends. The results can then be plotted.
## Load the packages used below: plyr for the grouped summary, lattice for the panel plot
library(plyr)
library(lattice)
## Average the imputed steps for every combination of interval and type of day
ImputedStepsPerIntervalDayCat <- ddply(
  ImputedActivity,
  .(interval, DayCat),
  summarize,
  AverageSteps = mean(steps)
)
## Two stacked line panels (one per type of day) of average steps against interval
xyplot(
  AverageSteps ~ interval | DayCat,
  data = ImputedStepsPerIntervalDayCat,
  type = "l",
  layout = c(1, 2),
  main = "Average steps (imputed) per interval and type of day",
  xlab = "Interval [5-min increments]",
  ylab = "Average number of steps"
)