I first loaded and transformed the data. From this data I plotted a histogram of the total number of steps taken per day and calculated the mean and median of those daily totals.
library(ggplot2)
# Read activity.csv straight out of the zip archive; the literal string "NA"
# marks missing values.
sample_base <- read.table(
  unz("./repdata-data-activity.zip", "activity.csv"),
  stringsAsFactors = FALSE,
  sep = ",",
  header = TRUE,
  na.strings = "NA"
)

# Flag every row that has at least one missing value. complete.cases()
# inspects the columns directly, whereas apply() over rows first coerces the
# whole data frame to a character matrix.
row.has.na <- !complete.cases(sample_base)
sample_final <- sample_base[!row.has.na, ]

# Total steps per day. The formula interface of aggregate() omits rows with
# NA in the modelled columns by default (na.action = na.omit), so passing the
# raw data here does not propagate missing values into the sums.
sample_final_steps <- aggregate(steps ~ date, data = sample_base, FUN = sum)

# Histogram of the daily totals with a kernel density overlay. The bars are
# rescaled to a density (y = ..density..) so both layers share one y scale;
# the axis labels describe what is actually plotted.
ggplot(sample_final_steps, aes(x = steps)) +
  geom_histogram(aes(y = ..density..), binwidth = 1000) +
  geom_density(alpha = 0.2, fill = "#FF6666") +
  xlab("Total steps per day") +
  ylab("Density")

# Mean and median of the daily totals as a one-row matrix.
x <- cbind(mean(sample_final_steps$steps), median(sample_final_steps$steps))
colnames(x) <- c("Mean", "Median")
x
## Mean Median
## [1,] 10766.19 10765
I then made a time series plot of the average number of steps per 5-minute interval and calculated that the interval with the maximum average number of steps runs from 8:35 to 8:40.
# Mean step count for each 5-minute interval, taken over the rows of
# sample_final.
sample_final_steps_interval <- aggregate(
  steps ~ interval,
  data = sample_final,
  FUN = mean
)

# Line plot of the average steps against the interval identifier.
ggplot(sample_final_steps_interval, aes(x = interval, y = steps)) +
  geom_line() +
  labs(x = "5-min Interval", y = "Average steps")

# Print the row belonging to the interval with the largest average.
with(
  sample_final_steps_interval,
  sample_final_steps_interval[which.max(steps), ]
)
## interval steps
## 104 835 206.1698
I calculated the number of rows with missing values (NA) and filled each missing step count with the mean for its 5-minute interval:
# Number of rows that contain at least one missing value.
sum(row.has.na)
## [1] 2304
# Impute every missing step count with the mean for its 5-minute interval.
# match() looks up each affected row's interval in the table of interval
# means, so the fill is a single vectorized assignment. Unlike assigning into
# a row subset inside a loop, this also works when some interval has no
# missing rows (assigning a value into a zero-row subset raises an error).
sample_base2 <- sample_base
steps_missing <- is.na(sample_base2$steps)
sample_base2$steps[steps_missing] <- sample_final_steps_interval$steps[
  match(
    sample_base2$interval[steps_missing],
    sample_final_steps_interval$interval
  )
]

# Total steps per day after imputation.
sample_final_steps2 <- aggregate(steps ~ date, data = sample_base2, FUN = sum)

# Histogram of the imputed daily totals with a kernel density overlay. The
# bars are rescaled to a density (y = ..density..) so both layers share one
# y scale; the axis labels describe what is actually plotted.
ggplot(sample_final_steps2, aes(x = steps)) +
  geom_histogram(aes(y = ..density..), binwidth = 1000) +
  geom_density(alpha = 0.2, fill = "#FF6666") +
  xlab("Total steps per day") +
  ylab("Density")

# Mean and median of the imputed daily totals as a one-row matrix.
x2 <- cbind(mean(sample_final_steps2$steps), median(sample_final_steps2$steps))
colnames(x2) <- c("Mean", "Median")
x2
## Mean Median
## [1,] 10766.19 10766.19
The mean does not change and the median shifts only slightly (from 10765 to 10766.19), so this could indeed be a reasonable strategy for filling in the missing values.
Next, I constructed the time series for weekdays and weekends.
# Label each observation as falling on a weekday or a weekend.
# The wday field of POSIXlt is numeric (0 = Sunday, 6 = Saturday), so this
# classification does not depend on the session locale, whereas weekdays()
# returns localized day names that only match "Saturday"/"Sunday" in English.
is_weekend <- as.POSIXlt(as.Date(sample_final$date))$wday %in% c(0L, 6L)
sample_final$type_of_day <- factor(
  ifelse(is_weekend, "weekend", "weekday"),
  levels = c("weekday", "weekend")
)

# Average steps per 5-minute interval, computed separately for weekdays and
# weekends.
# NOTE(review): this aggregates sample_final (rows with NA removed) although
# the result is named sample_base2_interval -- confirm whether the imputed
# sample_base2 data was intended here.
sample_base2_interval <- aggregate(
  steps ~ interval + type_of_day,
  data = sample_final,
  FUN = mean
)

# One time-series panel per day type.
ggplot(sample_base2_interval, aes(x = interval, y = steps)) +
  geom_line() +
  xlab("5-min Interval") +
  ylab("Average steps") +
  facet_wrap(~ type_of_day)