# By Jawad Rashid
# Load the activity data. The columns used by this script are `steps`,
# `date` and `interval`; `date` is converted from text to Date.
data <- read.csv("activity.csv", stringsAsFactors = FALSE)
data$date <- as.Date(data$date)

# Sum the steps within each day so that the histogram shows what its title
# says. sum() is deliberately left at na.rm = FALSE: a day that contains
# missing counts yields NA and is dropped by hist() instead of being drawn
# as an artificially low total.
totalStepsByDay <- tapply(data$steps, data$date, sum)
hist(totalStepsByDay, xlab = "Total number of steps",
     main = "Total number of steps taken each day")

# Mean and median of the raw step counts (one value per row of `data`),
# ignoring missing values. These are not statistics of the daily totals.
mean(data$steps, na.rm = TRUE)
## [1] 37.38
median(data$steps, na.rm = TRUE)
## [1] 0
# Average number of steps for each interval, taken over all dates and
# ignoring missing counts. The result is named by interval identifier.
averageStepsByInterval <- tapply(data$steps, data$interval, mean, na.rm = TRUE)

# The interval identifiers are stored as names (character); convert them
# explicitly for the x axis instead of relying on implicit coercion.
plot(as.numeric(names(averageStepsByInterval)), averageStepsByInterval,
     type = "l", xlab = "5-minute Interval",
     ylab = "Average number of steps taken",
     main = "Avg number of steps across 5-minute interval")

# Find the interval with the highest average.
# which.max() returns the POSITION of the first maximum (named by its
# interval), not the maximum itself, so the value is looked up separately.
maxIndex <- which.max(averageStepsByInterval)
# Convert the interval from string to integer value
maxInterval <- as.integer(names(maxIndex))
maxValue <- averageStepsByInterval[[maxIndex]]

maxInterval
## [1] 835
# Position of that interval within averageStepsByInterval
unname(maxIndex)
## [1] 104
# Highest average number of steps observed in any interval
maxValue
# Missing Values
# Flag the rows whose step count is missing and report how many there are.
# sum() counts the TRUE entries directly and, unlike indexing the second
# cell of table(), also works when nothing is missing.
missingValues <- is.na(data$steps)
sum(missingValues)
## [1] 2304

# Impute every missing step count with the mean of its interval.
# meanStepsByInterval is named by interval, so a single vectorised lookup by
# name replaces the row-by-row loop and is safe when there are no NA rows.
meanStepsByInterval <- tapply(data$steps, data$interval, mean, na.rm = TRUE)
filledInData <- data
filledInData$steps[missingValues] <- as.numeric(
  meanStepsByInterval[as.character(data$interval[missingValues])]
)

# Histogram of the per-day totals after imputation, matching the title.
totalStepsByDayFilled <- tapply(filledInData$steps, filledInData$date, sum)
hist(totalStepsByDayFilled, xlab = "Total number of steps",
     main = "Total number of steps taken each day")

# Mean and median of the individual (imputed) step counts, one value per
# row. These are not statistics of the daily totals.
mean(filledInData$steps, na.rm = TRUE)
## [1] 37.38
median(filledInData$steps, na.rm = TRUE)
## [1] 0
# Per-interval averages recomputed on the imputed data (computed once; the
# original repeated this identical assignment twice).
averageStepsByIntervalForFilledData <- tapply(
  filledInData$steps, filledInData$interval, mean, na.rm = TRUE
)

# Compare the first ten interval averages before and after imputation.
averageStepsByInterval[1:10]
## 0 5 10 15 20 25 30 35 40
## 1.71698 0.33962 0.13208 0.15094 0.07547 2.09434 0.52830 0.86792 0.00000
## 45
## 1.47170
averageStepsByIntervalForFilledData[1:10]
## 0 5 10 15 20 25 30 35 40
## 1.71698 0.33962 0.13208 0.15094 0.07547 2.09434 0.52830 0.86792 0.00000
## 45
## 1.47170

# Draw the two step-count distributions side by side. par() returns the
# previous settings when it changes them, so the layout is restored after
# the pair of plots instead of leaking into later base-graphics calls.
oldPar <- par(mfrow = c(1, 2))
hist(data$steps, xlab = "Total number of steps", main = "Missing Values")
hist(filledInData$steps, xlab = "Total number of steps", main = "Filled in Missing Values")
par(oldPar)
# The difference is easier to see when the step counts are log-transformed, i.e. plotted as log10(steps + 1):
# Side-by-side histograms of log10(steps + 1) for the original and the
# imputed data; the + 1 keeps zero counts finite under the logarithm.
# The previous par() settings are saved and restored so the two-panel
# layout does not persist after these plots.
oldPar <- par(mfrow = c(1, 2))
hist(log10(data$steps + 1), xlab = "Total number of steps", main = "Log of Missing Values")
hist(log10(filledInData$steps + 1), xlab = "Total number of steps", main = "Log of Filled in Values")
par(oldPar)
# Classify each date as weekday or weekend.
# The numeric wday field of POSIXlt (0 = Sunday ... 6 = Saturday) is used
# instead of comparing weekdays() output with English day names, because
# weekdays() returns names in the session locale.
dayOfWeek <- as.POSIXlt(filledInData$date)$wday
weekend <- (dayOfWeek == 0 | dayOfWeek == 6)

# Explicit levels keep the FALSE -> "weekday", TRUE -> "weekend" mapping
# fixed even if the data happen to contain only one of the two classes.
dayfactor <- factor(weekend, levels = c(FALSE, TRUE),
                    labels = c("weekday", "weekend"))
filledInData$daytype <- dayfactor

# Mean number of steps for every (day type, interval) combination.
# aggregate() stores the aggregated value in a column named `x`.
groupedData <- aggregate(
  filledInData$steps,
  list(DayType = filledInData$daytype, Interval = filledInData$interval),
  mean
)

# One line panel per day type, stacked vertically (1 column, 2 rows).
library(lattice)
xyplot(x ~ Interval | DayType, data = groupedData, layout = c(1, 2),
       xlab = "Interval", ylab = "Number of Steps", type = "l")