Unzip and load the data (i.e. read.csv()):
# Extract the CSV from the archive into the working directory.
# activity.zip is a zip archive, not a gzip-compressed tarball, so unzip()
# is the matching tool; untar() is not guaranteed to read zip input.
unzip(file.path(getwd(), "activity.zip"), exdir = getwd())
# Load the extracted file; the literal string "NA" marks missing values.
dat <- read.csv(
  file.path(getwd(), "activity.csv"),
  header = TRUE,
  na.strings = "NA"
)
str(dat)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
Transform the date column from data type factor into data type date for analysis:
# Parse the year-month-day strings in the date column into Date values.
dat[["date"]] <- as.Date(dat[["date"]], format = "%Y-%m-%d")
str(dat)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Date, format: "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
Make a histogram of the total number of steps taken each day:
library(ggplot2)
# Total steps per day. Because of na.rm = TRUE, a day whose readings are
# all NA sums to 0 rather than NA.
datstepsum <- aggregate(
  dat$steps,
  by = list(Date = dat$date),
  FUN = sum,
  na.rm = TRUE
)
# aggregate() names the summarised column "x". ggplot() replaces the
# deprecated qplot() and draws the same histogram.
ggplot(datstepsum, aes(x = x)) +
  geom_histogram(binwidth = 2000) +
  labs(title = "Total Number of Steps Taken Each Day", x = "steps per day")
Calculate the mean and median total number of steps taken per day:
# Mean of the per-day step totals (second column of datstepsum).
mean(datstepsum[[2]], na.rm = TRUE)
## [1] 9354
# Median of the same per-day totals.
median(datstepsum[[2]], na.rm = TRUE)
## [1] 10395
Make a time series plot of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis):
# Average number of steps for each 5-minute interval across all days,
# ignoring missing readings.
datintmean <- aggregate(
  dat$steps,
  by = list(interval = dat$interval),
  FUN = mean,
  na.rm = TRUE
)
# aggregate() names the summarised column "x". ggplot() replaces the
# deprecated qplot(). Interval identifiers are clock labels (e.g. 835 is
# 08:35), not elapsed minutes, so the x-axis label says so.
ggplot(datintmean, aes(x = interval, y = x)) +
  geom_line() +
  labs(
    title = "Daily Average Activity Pattern",
    x = "5-minute interval (HHMM)",
    y = "average number of steps"
  )
Find the 5-minute interval that, on average across all the days in the dataset, contains the maximum number of steps, and report its average number of steps:
# Row index of the interval with the highest average step count.
maxint <- which.max(datintmean[[2]])
# Show that interval together with its average.
datintmean[maxint, ]
## interval x
## 104 835 206.2
Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs):
# Rows with at least one NA = all rows minus the fully observed rows.
nrow(dat) - sum(complete.cases(dat))
## [1] 2304
Fill in the missing values of the number of steps using the mean for the corresponding 5-minute interval (calculated above):
library(plyr)
# Attach each interval's mean (column "x" of datintmean) to every row.
# plyr::join() keeps the row order of its first argument.
datna <- join(dat, datintmean, by = "interval")
# Replace each missing step count with the interval mean from the SAME row.
# The previous form assigned the whole mean column into the NA subset and
# silenced the length-mismatch warning; that is only correct when the NA
# rows happen to line up with the start of the mean column.
datna$steps <- ifelse(is.na(datna$steps), datna$x, datna$steps)
# Drop the helper column of interval means.
datna$x <- NULL
Make a histogram of the total number of steps taken each day, then calculate and report the mean and median total number of steps taken per day:
# Total steps per day after imputation.
datnastpsum <- aggregate(
  datna$steps,
  by = list(Date = datna$date),
  FUN = sum,
  na.rm = TRUE
)
# aggregate() names the summarised column "x". ggplot() replaces the
# deprecated qplot() and draws the same histogram.
ggplot(datnastpsum, aes(x = x)) +
  geom_histogram(binwidth = 2000) +
  labs(title = "Total Number of Steps Taken Each Day", x = "steps per day")
Compute the mean and median again to determine the impact of imputing missing data on the estimates of the total daily number of steps:
# Mean of the per-day step totals after imputation (second column).
mean(datnastpsum[[2]], na.rm = TRUE)
## [1] 10766
# Median of the same per-day totals.
median(datnastpsum[[2]], na.rm = TRUE)
## [1] 10766
Create a new factor variable in the dataset with two levels – “weekday” and “weekend” indicating whether a given date is a weekday or weekend day:
# Label every row "Weekend" or "Weekday" in one vectorised step.
# POSIXlt's wday field is numeric (0 = Sunday ... 6 = Saturday), so the
# test does not depend on the locale the way weekdays() names do.
# The result is stored as a two-level factor.
datna$DateType <- factor(
  ifelse(as.POSIXlt(datna$date)$wday %in% c(0, 6), "Weekend", "Weekday"),
  levels = c("Weekday", "Weekend")
)
Make a panel plot containing a time series plot (i.e. type = “l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis):
# Average steps per 5-minute interval, computed separately for each
# DateType value.
datnaint <- aggregate(
  datna$steps,
  by = list(DateType = datna$DateType, Interval = datna$interval),
  FUN = mean,
  na.rm = TRUE
)
# aggregate() names the summarised column "x". ggplot() replaces the
# deprecated qplot(); facet_grid(. ~ DateType) gives one panel per day type,
# side by side. Interval identifiers are clock labels (e.g. 835 is 08:35),
# not elapsed minutes, so the x-axis label says so.
ggplot(datnaint, aes(x = Interval, y = x)) +
  geom_line() +
  facet_grid(. ~ DateType) +
  labs(
    title = "Average Activity Pattern by Day Type (Weekday & Weekend)",
    x = "5-minute interval (HHMM)",
    y = "average number of steps"
  )