# Load the raw activity measurements; the first row of the CSV holds the
# column names.
data <- read.csv("activity.csv", header = TRUE)
# Keep only the rows with a recorded step count (drop rows where steps is NA).
# TRUE/FALSE are used instead of T/F, which are ordinary reassignable variables.
dataNaOmit <- data[!is.na(data$steps), ]
calculating the total number of steps taken each day
library(plyr)
# One row per date, with the sum of the recorded steps for that date.
totalPerDay <- ddply(
  dataNaOmit,
  "date",
  summarise,
  steps = sum(steps)
)
creating the plot
# Histogram of the per-day step totals (NA rows already removed).
hist(
  x = totalPerDay$steps,
  breaks = 20,
  col = "red",
  main = "Number of Steps",
  xlab = "Total number of steps taken each day",
  ylab = "Number of Days"
)
mean
# Mean of the daily step totals.
with(totalPerDay, mean(steps))
## [1] 10766
median
# Median of the daily step totals.
with(totalPerDay, median(steps))
## [1] 10765
Calculating the average number of steps taken in each 5-minute interval
# One row per 5-minute interval, with the mean step count across all days.
averagePerInterval <- ddply(
  dataNaOmit,
  "interval",
  summarise,
  steps = mean(steps)
)
creating the plot
# Line plot of the average steps per interval. The default axes are suppressed
# so the x axis can be labelled with clock times instead of raw interval codes.
plot(
  averagePerInterval$interval, averagePerInterval$steps,
  axes = FALSE, type = "l", col = "green",
  xlab = "Time", ylab = "Average Number of Steps",
  main = "Average Daily Activity Pattern"
)
# Interval codes are HHMM-style numbers (e.g. 600 is labelled "6:00").
# `labels` is spelled out in full rather than relying on partial matching.
axis(1, at = c(0, 600, 1200, 1800, 2400),
     labels = c("0:00", "6:00", "12:00", "18:00", "24:00"))
axis(2)
# Show the row for the interval with the highest average step count.
averagePerInterval[which.max(averagePerInterval[["steps"]]), , drop = FALSE]
## interval steps
## 104 835 206.2
So the 5-minute interval with the highest average number of steps is the one from 8:35 to 8:40
# Count the rows of the raw data whose step count is missing.
sum(is.na(data[["steps"]]))
## [1] 2304
I will fill each NA with the average value for that 5-minute interval
# Build the imputed dataset: every missing step count is replaced by the
# average step count of the same 5-minute interval (from averagePerInterval).
imputed <- data
missingSteps <- is.na(imputed$steps)
# Vectorised lookup: match() finds, for each missing row, the position of its
# interval in averagePerInterval. This replaces a row-by-row loop over
# 1:nrow(), which misbehaves on an empty data frame and errors when an
# interval has no match.
imputed$steps[missingSteps] <- averagePerInterval$steps[
  match(imputed$interval[missingSteps], averagePerInterval$interval)
]
# Reorder the rows by interval (plyr::arrange).
imputed <- arrange(imputed, interval)
calculating the total number of steps taken each day
# One row per date, with the sum of steps after imputation.
totalPerDayImputed <- ddply(
  imputed,
  "date",
  summarise,
  steps = sum(steps)
)
creating the plot
# Histogram of the per-day step totals computed from the imputed data.
hist(
  x = totalPerDayImputed$steps,
  breaks = 20,
  col = "blue",
  main = "Number of Steps",
  xlab = "Total number of steps taken each day",
  ylab = "Number of Days"
)
Calculate and report the mean and median total number of steps taken per day on the imputed dataset
# Mean of the daily totals in the imputed dataset.
with(totalPerDayImputed, mean(steps))
## [1] 10766
# Median of the daily totals in the imputed dataset.
with(totalPerDayImputed, median(steps))
## [1] 10766
Test whether these values differ from those in the first part
# Absolute change in the mean daily total caused by imputation.
abs(mean(totalPerDayImputed$steps) - mean(totalPerDay$steps))
## [1] 0
# Absolute change in the median daily total, as a fraction of the original
# median.
abs(median(totalPerDayImputed$steps) - median(totalPerDay$steps)) /
  median(totalPerDay$steps)
## [1] 0.0001104
So the mean did not change after imputing; the median changed slightly, by about 0.01% of the original value.
Test how the total number of steps taken differs after imputing
# Overall number of steps added by imputation; the outer parentheses print the
# value as it is assigned.
(totalDifference <- sum(imputed$steps) - sum(dataNaOmit$steps))
## [1] 86130
Imputing the dataset causes the estimate of the total number of steps to increase
# Set the LC_TIME locale category (controls how day and month names are
# formatted) to English.
# NOTE(review): "English" looks like a Windows-style locale name; on other
# platforms this call may fail with a warning and leave the locale
# unchanged - confirm the target platform.
Sys.setlocale("LC_TIME", "English")
## [1] "English_United States.1252"
# Label every observation as "weekend" or "weekday".
# The day of week is taken numerically from POSIXlt (wday: 0 = Sunday,
# 6 = Saturday) instead of comparing weekdays() output against English day
# names. The name-based comparison silently labels every row "weekday" when
# the session's LC_TIME locale is not English.
dayOfWeek <- as.POSIXlt(as.Date(imputed$date))$wday
imputed$weekdays <- ifelse(dayOfWeek %in% c(0, 6), "weekend", "weekday")
Calculating the average for each interval
# Mean steps for every (interval, weekday/weekend) combination.
average <- ddply(
  imputed,
  c("interval", "weekdays"),
  summarise,
  steps = mean(steps)
)
creating the plot
library(lattice)
# Two stacked panels (one per day type) of average steps against interval.
xyplot(
  steps ~ interval | weekdays,
  data = average,
  type = "l",
  layout = c(1, 2),
  xlab = "Interval",
  ylab = "Number of steps"
)