# Load the activity data from the project directory. The code below reads the
# columns `steps`, `date` and `interval` from the resulting data frame.
setwd("~/RepData_PeerAssessment1")
activity <- read.csv("activity.csv", header = TRUE)

# Treat each interval identifier as a category so it can be used for grouping.
activity$interval <- as.factor(activity$interval)

# Total steps per date; missing step counts are dropped before summing, so a
# date with no recorded steps gets a total of 0.
totstep <- with(activity, tapply(steps, date, sum, na.rm = TRUE))

# Histogram of the daily totals (the returned object is kept so its bin counts
# can be inspected later).
hist1 <- hist(totstep, xlab = "", main = "Total number of steps taken each day")

# Mean and median of the daily totals.
mean(totstep)
median(totstep)
## [1] 9354.23
## [1] 10395
# Average step count for each interval across all dates, ignoring missing
# step counts.
meanstep <- with(activity, tapply(steps, interval, mean, na.rm = TRUE))

# Numeric x-axis positions recovered from the interval factor levels.
interval <- as.integer(levels(activity$interval))

# Line plot of the average step count against the interval identifier.
plot(
  x = interval,
  y = meanstep,
  type = "l",
  xlab = "5-minute Interval",
  ylab = "Average numbers of Steps",
  main = "Time Series Plot ; interval"
)

# Name and position of the interval(s) whose average equals the maximum.
peak_value <- max(meanstep)
which(meanstep == peak_value)
## 835
## 104
# Number of observations whose step count is missing.
sum(is.na(activity[["steps"]]))
## [1] 2304
# Mean and median of the recorded (non-missing) step counts.
mean(activity[["steps"]], na.rm = TRUE)
median(activity[["steps"]], na.rm = TRUE)
## [1] 37.3826
## [1] 0
# Work on a copy so the original data stay untouched, and fill every missing
# step count with the median of the recorded step counts.
newactivity <- activity
newactivity$steps <- replace(
  newactivity$steps,
  is.na(newactivity$steps),
  median(newactivity$steps, na.rm = TRUE)
)

# Daily totals and histogram recomputed on the imputed data.
newtotstep <- with(newactivity, tapply(steps, date, sum))
hist2 <- hist(newtotstep, xlab = "",
              main = "Total number of steps taken each day ; Imputed")

# Bin-by-bin comparison of the two histograms.
hist1$counts == hist2$counts
## [1] TRUE TRUE TRUE TRUE TRUE
# Date-by-date comparison of the daily totals before and after imputation.
totstep == newtotstep
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06
## TRUE TRUE TRUE TRUE TRUE TRUE
## 2012-10-07 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12
## TRUE TRUE TRUE TRUE TRUE TRUE
## 2012-10-13 2012-10-14 2012-10-15 2012-10-16 2012-10-17 2012-10-18
## TRUE TRUE TRUE TRUE TRUE TRUE
## 2012-10-19 2012-10-20 2012-10-21 2012-10-22 2012-10-23 2012-10-24
## TRUE TRUE TRUE TRUE TRUE TRUE
## 2012-10-25 2012-10-26 2012-10-27 2012-10-28 2012-10-29 2012-10-30
## TRUE TRUE TRUE TRUE TRUE TRUE
## 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04 2012-11-05
## TRUE TRUE TRUE TRUE TRUE TRUE
## 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11
## TRUE TRUE TRUE TRUE TRUE TRUE
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17
## TRUE TRUE TRUE TRUE TRUE TRUE
## 2012-11-18 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23
## TRUE TRUE TRUE TRUE TRUE TRUE
## 2012-11-24 2012-11-25 2012-11-26 2012-11-27 2012-11-28 2012-11-29
## TRUE TRUE TRUE TRUE TRUE TRUE
## 2012-11-30
## TRUE
In this dataset, if a day has at least one missing value, then ALL of that day’s values are missing. Therefore, imputing with the median does not have much effect on the ‘daily’ step totals.
But this doesn’t mean that we should use the mean instead of the median, because the mean is so large that imputing with it would shift the daily totals substantially upward.
# Mean and median of the daily totals, first for the original data and then
# for the imputed data.
mean(totstep)
median(totstep)
mean(newtotstep)
median(newtotstep)
## [1] 9354.23
## [1] 10395
## [1] 9354.23
## [1] 10395
# Label every observation as "weekday" or "weekend" and attach the label to
# the imputed data as a factor column named "Indicator".
#
# The day of the week is read from the POSIXlt `wday` field (0 = Sunday,
# 6 = Saturday) rather than from weekdays() names: the numeric field does not
# depend on the session locale, and testing both 0 and 6 counts Saturdays and
# Sundays alike as weekend days. The comparison is vectorised, so no loop or
# incrementally grown vector is needed; a missing date yields an NA label.
date <- as.POSIXlt(newactivity$date)
a <- ifelse(date$wday == 0L | date$wday == 6L, "weekend", "weekday")
a <- as.factor(a)

# Append the label as the last column and give it its final name.
newact_ind <- cbind(newactivity, a)
colnames(newact_ind)[ncol(newact_ind)] <- "Indicator"
# Partition the labelled observations into weekday rows and weekend rows.
weekday_act <- newact_ind[newact_ind$Indicator %in% "weekday", ]
weekend_end <- newact_ind[newact_ind$Indicator %in% "weekend", ]

# Mean step count per interval within each partition.
new_meanstep_day <- with(weekday_act, tapply(steps, interval, mean))
new_meanstep_end <- with(weekend_end, tapply(steps, interval, mean))

# Numeric x-axis positions recovered from the interval factor levels.
interval <- as.integer(levels(newact_ind$interval))

# Two stacked panels: weekdays on top, weekends below.
par(mfrow = c(2, 1))
plot(
  x = interval,
  y = new_meanstep_day,
  type = "l",
  xlab = "Interval",
  ylab = "Mean step",
  main = "Average step across all weekdays"
)
plot(
  x = interval,
  y = new_meanstep_end,
  type = "l",
  xlab = "Interval",
  ylab = "Mean step",
  main = "Average step across all weekends"
)