Reading and Loading the data into R. This is the initial stage.
# Read activity.csv directly from the zip archive (no manual extraction).
# TRUE is spelled out because the T alias is an ordinary variable that can be
# reassigned.
activity <- read.table(
  unz("activity.zip", "activity.csv"),
  header = TRUE,
  sep = ","
)
# Preview the first rows to confirm the three columns were parsed.
head(activity)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
Transform the data into a format suitable for analysis. I shall convert the date column from a factor into a Date:
# Parse the date column into Date objects so later steps can group by day
# and look up the day of the week.
activity[["date"]] <- as.Date(activity[["date"]])
Ignoring the missing values in the data:
# Total steps per day. The formula interface of aggregate() drops rows whose
# steps value is NA before summing, so days without any recorded data are
# left out instead of being counted as zero.
stepsPerDay <- aggregate(steps ~ date, data = activity, FUN = sum, na.rm = TRUE)
# hist() bins the daily totals and shows how often each range occurs; a
# barplot with one bar per date is a time series, not a histogram.
hist(
  stepsPerDay$steps,
  main = "Histogram of Number of Steps per day",
  xlab = "Steps per day",
  ylab = "Number of days"
)
# Mean of the daily totals, rounded to whole steps.
MeanStepsPerDay <- round(mean(stepsPerDay$steps), 0)
MeanStepsPerDay
## [1] 10766
# Median of the daily step totals, rounded to a whole number of steps.
MedianStepsPerDay <- round(median(stepsPerDay[["steps"]]), digits = 0)
print(MedianStepsPerDay)
## [1] 10765
The mean number of steps per day is 10766 and the median number of steps per day is 10765.
Make a time series plot (i.e. type = “l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis):
# Average number of steps in each 5-minute interval across all days.
stepsPerInterval <- aggregate(steps ~ interval, data = activity, FUN = mean)
plot(
  stepsPerInterval,
  type = "l",
  col = "blue",
  main = "Time series plot of steps per Interval"
)
# Find the interval(s) whose average equals the overall peak.
MaxSteps <- max(stepsPerInterval[["steps"]])
peak_rows <- which(stepsPerInterval[["steps"]] == MaxSteps)
IntervalSteps <- stepsPerInterval[peak_rows, ]
y <- as.numeric(IntervalSteps[["interval"]])
The 5-minute interval that contains the maximum average number of steps is 835.
# Look at the first rows again: the leading step counts are missing.
head(activity, n = 6L)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
# Count every NA cell anywhere in the data frame.
missingValues <- length(which(is.na(activity)))
The total number of missing values is 2304.
The strategy does not need to be sophisticated. For example, you could use the mean/median for that day, or the mean for that 5-minute interval, etc.
# Replace missing step counts with the mean for the same 5-minute interval.
#
# Arguments:
#   activity        Data frame with `steps` and `interval` columns.
#   interval_means  Data frame with `interval` and `steps` columns giving the
#                   replacement value for each interval. Defaults to the
#                   `stepsPerInterval` object visible when the function is
#                   called, so existing one-argument calls keep working.
#
# Returns `activity` with the NA entries of `steps` filled in. An interval
# with no match in `interval_means` stays NA.
fillNA <- function(activity, interval_means = stepsPerInterval) {
  missing_steps <- is.na(activity$steps)
  # Only touch the column when there is something to fill, so zero-row input
  # and already-complete data come back unchanged.
  if (any(missing_steps)) {
    lookup <- match(activity$interval[missing_steps], interval_means$interval)
    activity$steps[missing_steps] <- interval_means$steps[lookup]
  }
  activity
}
# Build the imputed data set: missing step counts are replaced by fillNA().
tidyActivity <- fillNA(activity)
# fillNA() only fills in the steps column and adds no columns, so this keeps
# the same three columns; selecting them by name makes the expected layout
# explicit instead of relying on column positions.
tidyActivity <- tidyActivity[, c("steps", "date", "interval")]
dim(tidyActivity)
## [1] 17568 3
head(tidyActivity)
## steps date interval
## 1 1.7169811 2012-10-01 0
## 2 0.3396226 2012-10-01 5
## 3 0.1320755 2012-10-01 10
## 4 0.1509434 2012-10-01 15
## 5 0.0754717 2012-10-01 20
## 6 2.0943396 2012-10-01 25
Do these values differ from the estimates from the first part of the assignment? What is the impact of imputing missing data on the estimates of the total daily number of steps?
# Total steps per day, this time from the data set with imputed values.
stepsPerDay2 <- aggregate(steps ~ date, data = tidyActivity, FUN = sum, na.rm = TRUE)
# hist() bins the daily totals and shows how often each range occurs; a
# barplot with one bar per date is a time series, not a histogram.
hist(
  stepsPerDay2$steps,
  main = "Histogram of Number of Steps per day",
  xlab = "Steps per day",
  ylab = "Number of days"
)
# Label every observation as "Weekday" or "Weekend".
# The numeric day of week from as.POSIXlt() (0 = Sunday, 6 = Saturday) is
# used instead of weekdays(), whose day names depend on the session locale
# and would not match "Saturday"/"Sunday" outside an English locale.
day_number <- as.POSIXlt(tidyActivity$date)$wday
tidyActivity$DayOfWeek <- factor(
  ifelse(day_number %in% c(0, 6), "Weekend", "Weekday")
)
Make a panel plot containing a time series plot (i.e. type = “l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis).
library(lattice)
# Mean steps per 5-minute interval, computed separately for each day type.
stepsPerDay4 <- aggregate(
  tidyActivity$steps,
  by = list(
    Interval = tidyActivity$interval,
    DayOfWeek = tidyActivity$DayOfWeek
  ),
  FUN = mean
)
# The aggregated value column has no name of its own yet; call it Steps.
names(stepsPerDay4)[3] <- "Steps"
# One line panel per day type, laid out as one column by two rows.
xyplot(
  Steps ~ Interval | DayOfWeek,
  data = stepsPerDay4,
  type = "l",
  layout = c(1, 2),
  xlab = "Interval",
  ylab = "Number of steps"
)