# Load the raw activity data; the str() output below shows its three columns
# (steps, date, interval).
activity <- read.csv("activity.csv")
str(activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...

# Convert the date column to Date class so it can be used for calendar logic.
activity <- transform(activity, date = as.Date(date))

# Total steps per day. The grouping column of the result is named Group.1.
# NOTE: with na.rm = TRUE a day whose step values are all NA sums to 0 (not
# NA), so such days show up as zero-step days in the histogram and statistics.
activity_day_sum <- aggregate(
  activity["steps"],
  by = list(activity$date),
  FUN = sum,
  na.rm = TRUE
)
hist(
  activity_day_sum$steps,
  col = "red",
  xlab = "Total number of steps taken each day"
)

# Summary statistics of the daily totals (reported in the text that follows).
mean_day_sum <- mean(activity_day_sum$steps)
median_day_sum <- median(activity_day_sum$steps)
The mean of the total steps per day is 9354.2295082, while the median is 10395.
# Mean number of steps for each interval value, averaged over all days and
# ignoring missing values. Result columns: Group.1 (the interval) and steps.
activity_interval <- aggregate(
  activity["steps"],
  by = list(activity$interval),
  FUN = mean,
  na.rm = TRUE
)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.4
ggplot(activity_interval, aes(Group.1, steps)) + geom_line()

# Locate the row with the highest mean once and reuse it for both lookups.
# drop = FALSE keeps each result as a 1x1 data frame, as before.
peak_row <- which.max(activity_interval$steps)
interval_max <- activity_interval[peak_row, 1, drop = FALSE]
steps_max_interval <- activity_interval[peak_row, 2, drop = FALSE]
On average across all days, interval 835 contains the maximum number of steps, i.e. 206.1698113.
# Flag the missing step values, count them, and express the count as a
# fraction of all observations (percent_na is a proportion, not a percentage).
steps_missing <- is.na(activity$steps)
number_na <- sum(steps_missing)
percent_na <- number_na / length(steps_missing)
The total number of missing values in the dataset is 2304, which is a proportion of 0.1311475 of all observations.
# Impute missing step counts with the mean of the corresponding interval
# (taken from activity_interval: Group.1 = interval, steps = mean steps).
activity_imput <- activity
na_rows <- is.na(activity_imput$steps)
# Vectorised lookup: for every row with a missing value, find the matching
# row of activity_interval by interval, then copy that interval's mean.
interval_row <- match(
  activity_imput$interval[na_rows],
  activity_interval$Group.1
)
activity_imput$steps[na_rows] <- activity_interval$steps[interval_row]

# Daily totals on the imputed data; no na.rm needed once the gaps are filled.
activity_imput_daysum <- aggregate(
  activity_imput["steps"],
  by = list(activity_imput$date),
  FUN = sum
)
hist(
  activity_imput_daysum$steps,
  col = "blue",
  xlab = "Total number of steps taken each day with imputed missing values"
)

# Summary statistics of the imputed daily totals (reported in the text below).
mean_dayim_sum <- mean(activity_imput_daysum$steps)
median_dayim_sum <- median(activity_imput_daysum$steps)
The mean of the total number of steps taken each day with imputed values is $1.0766189 \times 10^{4}$, while the median is $1.0766189 \times 10^{4}$.
# Classify every observation as falling on a weekday or a weekend day.
# POSIXlt$wday is numeric (0 = Sunday ... 6 = Saturday), so the test does not
# depend on the session locale the way matching English day names would.
is_weekend <- as.POSIXlt(activity_imput$date)$wday %in% c(0L, 6L)
# Explicit levels/labels keep both categories (and their order) even if one of
# them happens to be absent from the data.
# NOTE: this variable shares its name with base::weekdays(); the name is kept
# in case other code refers to it.
weekdays <- factor(
  is_weekend,
  levels = c(FALSE, TRUE),
  labels = c("weekday", "weekend")
)
activity_imput$weekdays <- weekdays
str(activity_imput)
## 'data.frame': 17568 obs. of 4 variables:
## $ steps : num 1.717 0.3396 0.1321 0.1509 0.0755 ...
## $ date : Date, format: "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
## $ weekdays: Factor w/ 2 levels "weekday","weekend": 1 1 1 1 1 1 1 1 1 1 ...

# Mean steps per interval, computed separately for weekdays and weekends.
activity_interval_w <- aggregate(
  activity_imput["steps"],
  by = list(activity_imput$interval, activity_imput$weekdays),
  FUN = mean
)
# aggregate() names the grouping columns Group.1/Group.2; give them real names.
names(activity_interval_w)[1:2] <- c("interval", "weekdays")

# One line-plot panel per day type, side by side.
ggplot(activity_interval_w, aes(interval, steps)) +
  geom_line() +
  facet_grid(. ~ weekdays)