# Work from the project folder that holds the raw data file.
setwd("e:\\module5")
# Load the activity log, fixing the type of each of the three columns
# (numeric, character, numeric, in file order).
activity <- read.csv(
  file = "activity.csv",
  colClasses = c("numeric", "character", "numeric")
)
# Preview the first six rows.
head(activity, n = 6L)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
# Show the column names of the loaded data.
colnames(activity)
## [1] "steps" "date" "interval"
# Convert the year-month-day strings into Date objects.
activity[["date"]] <- as.Date(activity[["date"]], format = "%Y-%m-%d")
# Total steps per day. The formula interface of aggregate() omits rows whose
# steps value is NA by default, so days without any recorded steps do not
# appear in the result.
StepsTotal <- aggregate(
  steps ~ date,
  data = activity,
  FUN = sum,
  na.rm = TRUE
)
head(StepsTotal, n = 6L)
## date steps
## 1 2012-10-02 126
## 2 2012-10-03 11352
## 3 2012-10-04 12116
## 4 2012-10-05 13294
## 5 2012-10-06 15420
## 6 2012-10-07 11015
# Histogram of the daily step totals. The x-axis carries the number of steps
# taken in a day (not the calendar day), so it is labelled accordingly.
hist(
  StepsTotal$steps,
  main = "Total steps by day",
  xlab = "Total steps per day",
  col = "red"
)
The mean and median of the total number of steps taken per day, respectively:
# Mean, then median, of the daily step totals.
mean(StepsTotal[["steps"]])
median(StepsTotal[["steps"]])
## [1] 10766.19
## [1] 10765
Compute the mean number of steps for each 5-minute interval, averaged across all days, and plot it as a time series.
# Average the steps of every 5-minute interval over all days, skipping NAs.
# The result is indexed by interval identifier.
time_series <- tapply(
  X = activity$steps,
  INDEX = activity$interval,
  FUN = mean,
  na.rm = TRUE
)
# Line plot of the per-interval averages; the x values are the interval
# identifiers carried as names of the tapply() result.
plot(
  x = names(time_series),
  y = time_series,
  type = "l",
  xlab = "5-min interval",
  ylab = "Average across all Days",
  main = "Average number of steps taken",
  col = "red"
)
# Position of the highest average; its name is the matching interval.
max_interval <- which.max(time_series)
names(max_interval)
## [1] "835"
# Count every missing cell in the data set (all columns are inspected).
activity_NA <- length(which(is.na(activity)))
print(activity_NA)
## [1] 2304
Replace each NA with the mean number of steps for that 5-minute interval.
# Impute missing step counts with the mean of the same 5-minute interval.
#
# StepsAverage holds one row per interval with the mean of its non-missing
# step counts (the formula interface of aggregate() omits NA rows by default).
StepsAverage <- aggregate(steps ~ interval, data = activity, FUN = mean)
# Vectorised lookup: match() finds, for every observation, the row of
# StepsAverage belonging to its interval. Only observations whose steps are
# NA take the looked-up mean; all others keep their recorded value.
# This replaces a row-by-row loop that grew the vector with c() (quadratic
# time) and that silently produced a too-short vector whenever an interval
# had no mean available.
fillNA <- ifelse(
  is.na(activity$steps),
  StepsAverage$steps[match(activity$interval, StepsAverage$interval)],
  activity$steps
)
# Copy of the data with the imputed step counts; the original is untouched.
new_activity <- activity
new_activity$steps <- fillNA
# Total steps per day, computed on the data set with imputed values.
StepsTotal2 <- aggregate(steps ~ date, data = new_activity, sum, na.rm = TRUE)
# Histogram of the daily totals. The x-axis carries the number of steps taken
# in a day (not the calendar day), so it is labelled accordingly.
hist(
  StepsTotal2$steps,
  main = "Total steps by day",
  xlab = "Total steps per day",
  col = "red"
)
The mean and median of the total number of steps taken per day after filling in the missing values, respectively:
# Mean, then median, of the daily step totals after imputation.
mean(StepsTotal2[["steps"]])
median(StepsTotal2[["steps"]])
## [1] 10766.19
## [1] 10766.19
There is no difference in the mean and only a slight difference in the median.
For this part, the weekdays() function may be of some help. Use the dataset with the filled-in missing values.
# Compare the average activity profile of weekdays and weekends.
#
# Weekday name of every observation. weekdays() returns names in the current
# locale, so it is kept for reference only and is NOT used to classify days.
day <- weekdays(activity$date)
# Classify each observation without depending on the locale: the wday field
# of POSIXlt is the day of the week as a number, 0 for Sunday through 6 for
# Saturday. The vectorised form also replaces the former row-by-row loop.
daylevel <- ifelse(
  as.POSIXlt(activity$date)$wday %in% c(0L, 6L),
  "Weekend",
  "Weekday"
)
activity$daylevel <- factor(daylevel)
# The comparison is meant to run on the data set with the filled-in missing
# values, so the classification is copied onto new_activity (same rows, same
# order) and the averages are taken from there.
new_activity$daylevel <- activity$daylevel
# Mean steps per interval within each day type; columns come out as
# interval, daylevel, steps.
stepsByDay <- aggregate(
  steps ~ interval + daylevel,
  data = new_activity,
  FUN = mean
)
library(lattice)
# One panel per day type, stacked vertically.
xyplot(
  steps ~ interval | daylevel,
  data = stepsByDay,
  type = "l",
  layout = c(1, 2),
  xlab = "Interval",
  ylab = "Number of steps"
)