Here we analyze the activity of an individual, measured as the number of steps taken during each 5-minute interval of the day. We determine the time of day at which the individual is most active and look for a pattern in the activity, if any. For convenience, we assume that the input file is located in the same directory from which this R Markdown file is run.
# Load the raw activity log; the file is read from the working directory.
f <- read.csv(file = "activity.csv")

# Total number of steps per day. na.rm = FALSE is kept explicit: a day that
# contains a missing reading gets an NA total rather than an understated sum.
g <- aggregate(
  f["steps"],
  by = list(day = f$date),
  FUN = sum,
  na.rm = FALSE
)

# Distribution of the daily totals.
hist(
  g[["steps"]],
  col = "RED",
  main = "Steps taken each day",
  xlab = "Number of Steps"
)

# Mean and median of the daily totals, skipping days whose total is NA.
mn <- mean(g[["steps"]], na.rm = TRUE)
md <- median(g[["steps"]], na.rm = TRUE)
# Average number of steps for each 5-minute interval, taken across all days
# and ignoring missing readings.
a <- aggregate(f$steps, list(f$interval), mean, na.rm = TRUE)
colnames(a) <- c("interval", "steps")

# Daily activity profile: average steps against interval identifier.
plot(
  a$interval, a$steps,
  type = "l",
  xlab = "Interval",
  ylab = "Average no. of steps",
  main = "Average no. of steps taken daily in 5 minutes intervals"
)

# Interval(s) with the highest average. An interval that has no recorded value
# on any day averages to NaN; na.rm = TRUE keeps that from turning the maximum
# into NaN, and which() drops the resulting NA comparisons, so 'maxavg' holds
# real interval identifiers (all of them if there is a tie).
maxavg <- a$interval[which(a$steps == max(a$steps, na.rm = TRUE))]
# Row counts: 'x' is the total number of observations and 'y' the number that
# have a recorded step count, so x - y is the number of missing values.
x <- nrow(f)
y <- sum(!is.na(f$steps))

# 'e' is a copy of the data in which every missing step count is replaced by
# the average for the same 5-minute interval (taken from 'a').
e <- f
t <- which(is.na(f$steps))

# Vectorised lookup: match() gives, for each missing row, the row of 'a' with
# the same interval. Unlike a `1:length(t)` loop, this is also a harmless
# no-op when there are no missing values at all.
e$steps[t] <- a$steps[match(e$interval[t], a$interval)]
# Total steps per day, recomputed from the imputed data set 'e'.
# na.rm = FALSE is passed only to mirror the call that builds 'g'; it makes no
# difference as long as 'e' contains no NA.
b <- setNames(
  aggregate(e["steps"], by = list(e$date), FUN = sum, na.rm = FALSE),
  c("day_b", "steps_b")
)

# Mean and median of the daily totals after imputation.
mn_b <- mean(b[["steps_b"]])
md_b <- median(b[["steps_b"]])

# Distribution of the daily totals after imputation.
hist(
  b[["steps_b"]],
  col = "BLUE",
  main = "Steps taken each day",
  xlab = "Number of Steps"
)
# Label every observation as "weekday" or "weekend" and prepend the label to
# 'e' as a factor column named 'day'.
day <- factor(c("weekday", "weekend"))

# POSIXlt$wday is numeric (0 = Sunday ... 6 = Saturday), so the test does not
# depend on the locale the way comparing weekdays() against English day names
# does. Computing the whole column at once also avoids relying on data.frame()
# recycling a 2-element factor (an error for an odd number of rows) and the
# slow cell-by-cell assignment loop.
is_weekend <- as.POSIXlt(as.Date(e$date))$wday %in% c(0L, 6L)
e <- data.frame(
  day = factor(ifelse(is_weekend, "weekend", "weekday"), levels = levels(day)),
  e
)
library(lattice)
# Mean number of steps for every (day type, interval) combination.
p <- aggregate(
  e["steps"],
  by = list(day = e$day, interval = e$interval),
  FUN = mean
)

# One line panel per day type, stacked vertically (1 column, 2 rows).
xyplot(
  steps ~ interval | day,
  data = p,
  type = "l",
  xlab = "Interval",
  ylab = "Number of steps",
  layout = c(1, 2)
)