# Read the raw activity data; the relative path is resolved against the
# current working directory.
data <- read.csv("activity.csv", header = TRUE)
# Create a subset with the rows whose `steps` value is NA removed.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable names.
dataNaOmit <- subset(data, !is.na(data$steps))
calculating the total number of steps taken each day
# plyr supplies ddply() and summarise() for the per-group summaries.
library(plyr)

# Sum the steps within each date, using the subset that has NA rows removed.
totalPerDay <- ddply(
  dataNaOmit,
  .(date),
  summarise,
  steps = sum(steps)
)

# Histogram of the daily totals, requesting roughly 20 bins.
hist(
  totalPerDay$steps,
  breaks = 20,
  main = "Number of Steps",
  xlab = "Total number of steps taken each day",
  ylab = "Number of Days",
  col = "blue"
)
mean
# Mean of the daily step totals.
mean(totalPerDay[["steps"]])
## [1] 10766
median
# Median of the daily step totals.
median(totalPerDay[["steps"]])
## [1] 10765
Calculating the average number of steps taken in each 5-minute interval
# Average the steps within each interval across all days, using the subset
# that has NA rows removed.
averagePerInterval <- ddply(dataNaOmit, .(interval), summarise,
                            steps = mean(steps))
# Line plot of the average profile. The default axes are suppressed so that
# the x axis can carry clock-time labels instead of the raw interval codes.
plot(averagePerInterval$interval, averagePerInterval$steps,
     axes = FALSE, type = "l", col = "blue",
     xlab = "Time", ylab = "Average Number of Steps",
     main = "Average Daily Activity Pattern")
# `labels` is written in full rather than relying on partial matching of
# `label`.
axis(1, at = c(0, 600, 1200, 1800, 2400),
     labels = c("0:00", "6:00", "12:00", "18:00", "24:00"))
axis(2)
# Row holding the interval with the highest average number of steps.
averagePerInterval[which.max(averagePerInterval$steps), ]
## interval steps
## 104 835 206.2
The result is 835. So it is the interval from 8:35 to 8:40
Imputing missing values
# Count the rows whose `steps` value is missing.
sum(is.na(data[["steps"]]))
## [1] 2304
2. Devise a strategy for filling in all of the missing values in the dataset. The strategy does not need to be sophisticated.
I will fill each NA with the average value for that 5-minute interval.
# Fill each missing `steps` value with the average for the same interval,
# taken from averagePerInterval.
# match() looks up every interval in one vectorised pass, which avoids
# scanning the averages table once per row and also copes with a data frame
# that has zero rows (where `1:nrow()` would iterate over c(1, 0)).
imputed <- data
missingSteps <- is.na(imputed$steps)
intervalRow <- match(imputed$interval[missingSteps],
                     averagePerInterval$interval)
imputed$steps[missingSteps] <- averagePerInterval$steps[intervalRow]
# Reorder the rows by interval (arrange() comes from plyr, loaded earlier).
imputed <- arrange(imputed, interval)
Make a histogram of the total number of steps taken each day, and calculate and report the mean and median total number of steps taken per day. Do these values differ from the estimates from the first part of the assignment? What is the impact of imputing missing data on the estimates of the total daily number of steps?
# Sum the steps within each date on the imputed data.
totalPerDayImputed <- ddply(
  imputed,
  .(date),
  summarise,
  steps = sum(steps)
)

# Histogram of the imputed daily totals, requesting roughly 20 bins.
hist(
  totalPerDayImputed$steps,
  breaks = 20,
  main = "Number of Steps",
  xlab = "Total number of steps taken each day",
  ylab = "Number of Days",
  col = "blue"
)
Calculate and report the mean and median total number of steps taken per day on the imputed dataset
# Mean of the daily totals computed from the imputed data.
mean(totalPerDayImputed[["steps"]])
## [1] 10766
# Median of the daily totals computed from the imputed data.
median(totalPerDayImputed[["steps"]])
## [1] 10766
Test whether these values differ from those in the first part
# Absolute change in the mean of the daily totals.
abs(mean(totalPerDay[["steps"]]) - mean(totalPerDayImputed[["steps"]]))
## [1] 0
# Change in the median, expressed relative to the original median.
abs(median(totalPerDay[["steps"]]) - median(totalPerDayImputed[["steps"]])) /
  median(totalPerDay[["steps"]])
## [1] 0.0001104
So the mean didn’t change after imputing; the median changed slightly, by about 0.01% of the original value.
Test how the overall total number of steps differs after imputing
# Steps added by imputation: total over the imputed data minus the total
# over the subset that has NA rows removed.
totalDifference <- sum(imputed[["steps"]]) - sum(dataNaOmit[["steps"]])
print(totalDifference)
## [1] 86130
Are there differences in activity patterns between weekdays and weekends?
# Set the LC_TIME (date/time formatting) locale category to "English".
# NOTE(review): "English" looks like a Windows-style locale name; it may not
# be accepted on other platforms — confirm.
Sys.setlocale(category = "LC_TIME", locale = "English")
## [1] "English_United States.1252"
# Label every row as "weekend" or "weekday".
# POSIXlt's `wday` field numbers the days 0 (Sunday) to 6 (Saturday), so the
# test does not depend on the language of the weekday names under the
# current LC_TIME locale, as a comparison against "Saturday"/"Sunday" would.
imputed$weekdays <- as.POSIXlt(as.Date(imputed$date))$wday
imputed$weekdays <- ifelse(imputed$weekdays %in% c(0L, 6L), "weekend", "weekday")
2.Make a panel plot containing a time series plot (i.e. type = “l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis).
# lattice provides xyplot() for the panel plot.
library(lattice)

# Mean steps for every combination of interval and day type.
average <- ddply(
  imputed,
  .(interval, weekdays),
  summarise,
  steps = mean(steps)
)

# One line panel per day type, laid out as 1 column by 2 rows.
xyplot(
  steps ~ interval | weekdays,
  data = average,
  layout = c(1, 2),
  type = "l",
  xlab = "Interval",
  ylab = "Number of steps"
)