Loading and preprocessing the data
# Read the raw activity log from the working directory.
data <- read.csv(file = "activity.csv")
# Keep only the rows that contain no NA in any column.
new_data <- data[rowSums(is.na(data)) == 0, ]
What is the mean total number of steps taken per day?
# Total steps per date, computed from the complete rows only.
k <- with(new_data, tapply(steps, date, sum))
# Date labels of the totals, in the same order as `k`.
l <- dimnames(k)[[1]]
# hist() builds its title and x-axis label from the expression `k`,
# so the variable is passed under that name.
hist(k)
# Mean of the recorded step values within each date.
with(new_data, tapply(steps, date, mean))
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06
## NA 0.4375 39.4167 42.0694 46.1597 53.5417
## 2012-10-07 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12
## 38.2465 NA 44.4826 34.3750 35.7778 60.3542
## 2012-10-13 2012-10-14 2012-10-15 2012-10-16 2012-10-17 2012-10-18
## 43.1458 52.4236 35.2049 52.3750 46.7083 34.9167
## 2012-10-19 2012-10-20 2012-10-21 2012-10-22 2012-10-23 2012-10-24
## 41.0729 36.0938 30.6285 46.7361 30.9653 29.0104
## 2012-10-25 2012-10-26 2012-10-27 2012-10-28 2012-10-29 2012-10-30
## 8.6528 23.5347 35.1354 39.7847 17.4236 34.0938
## 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04 2012-11-05
## 53.5208 NA 36.8056 36.7049 NA 36.2465
## 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11
## 28.9375 44.7326 11.1771 NA NA 43.7778
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17
## 37.3785 25.4722 NA 0.1424 18.8924 49.7882
## 2012-11-18 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23
## 52.4653 30.6979 15.5278 44.3993 70.9271 73.5903
## 2012-11-24 2012-11-25 2012-11-26 2012-11-27 2012-11-28 2012-11-29
## 50.2708 41.0903 38.7569 47.3819 35.3576 24.4688
## 2012-11-30
## NA
# Median of the recorded step values within each date.
with(new_data, tapply(steps, date, median))
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06
## NA 0 0 0 0 0
## 2012-10-07 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12
## 0 NA 0 0 0 0
## 2012-10-13 2012-10-14 2012-10-15 2012-10-16 2012-10-17 2012-10-18
## 0 0 0 0 0 0
## 2012-10-19 2012-10-20 2012-10-21 2012-10-22 2012-10-23 2012-10-24
## 0 0 0 0 0 0
## 2012-10-25 2012-10-26 2012-10-27 2012-10-28 2012-10-29 2012-10-30
## 0 0 0 0 0 0
## 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04 2012-11-05
## 0 NA 0 0 NA 0
## 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11
## 0 0 0 NA NA 0
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17
## 0 0 NA 0 0 0
## 2012-11-18 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23
## 0 0 0 0 0 0
## 2012-11-24 2012-11-25 2012-11-26 2012-11-27 2012-11-28 2012-11-29
## 0 0 0 0 0 0
## 2012-11-30
## NA
What is the average daily activity pattern?
# Mean steps for each interval identifier, averaged over the complete rows.
a <- with(new_data, tapply(steps, interval, mean))
# Interval identifiers as character labels, in the same order as `a`.
b <- dimnames(a)[[1]]
# Line plot of the mean against the interval; axis labels come from the
# expressions `b` and `a`.
plot(x = b, y = a, type = "l")
# Position of the largest interval mean (names stripped before the search).
index <- which.max(as.vector(a))
# Shows the interval label together with its mean.
print(a[index])
## 835
## 206.2
Imputing missing values
# Number of rows removed when the incomplete rows were dropped.
dim(data)[1L] - dim(new_data)[1L]
## [1] 2304
# Impute every missing step count with the mean for the same interval.
# `a` holds the per-interval means of the complete rows, named by the
# interval identifier; `data` is the raw table with columns `steps`
# and `interval`.
n_data <- data
missing_steps <- is.na(n_data$steps)
interval_key <- as.character(n_data$interval[missing_steps])
# A row-by-row `a[[key]]` lookup would stop on an interval that has no mean;
# keep that failure explicit instead of silently writing NA.
stopifnot(all(interval_key %in% names(a)))
# One vectorised lookup replaces a scalar data-frame assignment per row.
n_data$steps[missing_steps] <- as.vector(a[interval_key])
# Mean of the (now complete) step values within each date.
tapply(n_data$steps, n_data$date, mean)
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06
## 37.3826 0.4375 39.4167 42.0694 46.1597 53.5417
## 2012-10-07 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12
## 38.2465 37.3826 44.4826 34.3750 35.7778 60.3542
## 2012-10-13 2012-10-14 2012-10-15 2012-10-16 2012-10-17 2012-10-18
## 43.1458 52.4236 35.2049 52.3750 46.7083 34.9167
## 2012-10-19 2012-10-20 2012-10-21 2012-10-22 2012-10-23 2012-10-24
## 41.0729 36.0938 30.6285 46.7361 30.9653 29.0104
## 2012-10-25 2012-10-26 2012-10-27 2012-10-28 2012-10-29 2012-10-30
## 8.6528 23.5347 35.1354 39.7847 17.4236 34.0938
## 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04 2012-11-05
## 53.5208 37.3826 36.8056 36.7049 37.3826 36.2465
## 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11
## 28.9375 44.7326 11.1771 37.3826 37.3826 43.7778
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17
## 37.3785 25.4722 37.3826 0.1424 18.8924 49.7882
## 2012-11-18 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23
## 52.4653 30.6979 15.5278 44.3993 70.9271 73.5903
## 2012-11-24 2012-11-25 2012-11-26 2012-11-27 2012-11-28 2012-11-29
## 50.2708 41.0903 38.7569 47.3819 35.3576 24.4688
## 2012-11-30
## 37.3826
# Median of the imputed step values within each date.
with(n_data, tapply(steps, date, median))
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06
## 34.11 0.00 0.00 0.00 0.00 0.00
## 2012-10-07 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12
## 0.00 34.11 0.00 0.00 0.00 0.00
## 2012-10-13 2012-10-14 2012-10-15 2012-10-16 2012-10-17 2012-10-18
## 0.00 0.00 0.00 0.00 0.00 0.00
## 2012-10-19 2012-10-20 2012-10-21 2012-10-22 2012-10-23 2012-10-24
## 0.00 0.00 0.00 0.00 0.00 0.00
## 2012-10-25 2012-10-26 2012-10-27 2012-10-28 2012-10-29 2012-10-30
## 0.00 0.00 0.00 0.00 0.00 0.00
## 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04 2012-11-05
## 0.00 34.11 0.00 0.00 34.11 0.00
## 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11
## 0.00 0.00 0.00 34.11 34.11 0.00
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17
## 0.00 0.00 34.11 0.00 0.00 0.00
## 2012-11-18 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23
## 0.00 0.00 0.00 0.00 0.00 0.00
## 2012-11-24 2012-11-25 2012-11-26 2012-11-27 2012-11-28 2012-11-29
## 0.00 0.00 0.00 0.00 0.00 0.00
## 2012-11-30
## 34.11
Are there differences in activity patterns between weekdays and weekends?
# Request English day names for weekdays()/format().
# "English" is a Windows-style locale name; when the operating system rejects
# it, Sys.setlocale() only warns and leaves the locale unchanged, which would
# let non-English day names slip through. Fall back to the portable "C"
# locale, which also produces English names.
tryCatch(
  Sys.setlocale("LC_TIME", "English"),
  warning = function(w) Sys.setlocale("LC_TIME", "C")
)
## [1] "English_United States.1252"
# Classify every observation as "weekday" or "weekend".
# The numeric `wday` field of POSIXlt (0 = Sunday, ..., 6 = Saturday) is used
# so the result does not depend on the session's LC_TIME locale.
obs_date <- as.Date(as.character(n_data$date), format = "%Y-%m-%d")
stopifnot(!anyNA(obs_date))
wd <- factor(
  ifelse(as.POSIXlt(obs_date)$wday %in% c(0L, 6L), "weekend", "weekday"),
  levels = c("weekday", "weekend")
)
n_data[, 4] <- wd
colnames(n_data) <- c("steps", "date", "interval", "day")

# Average the steps per interval separately for each day type.
# Averaging over the interval alone and writing that back into
# `n_data$steps` would give both panels the same series and destroy the
# imputed step counts, so the summary is kept in its own table and any
# weekday/weekend conclusion should be read off this plot.
day_pattern <- aggregate(steps ~ interval + day, data = n_data, FUN = mean)

library(lattice)
xyplot(steps ~ interval | day, data = day_pattern, type = "l",
       layout = c(1, 2), xlab = "Interval", ylab = "Number of Steps")
Thus, based on the plot above, there are no apparent differences in activity patterns between weekdays and weekends.