# Load the raw activity data directly from the CSV stored inside the zip archive.
data.df <- read.csv(
  file = unz(description = "activity.zip", filename = "activity.csv"),
  header = TRUE
)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Wrap the raw data frame as a dplyr table for the pipelines below.
# NOTE(review): tbl_df() is deprecated in recent dplyr releases; consider
# as_tibble() when upgrading.
data.df.tbl <- tbl_df(data.df)

# Per-day total and mean step counts. The `steps >= 0` filter also removes the
# rows whose step count is NA, since filter() only keeps rows where the
# condition evaluates to TRUE.
data.per.date <- data.df.tbl %>%
  filter(steps >= 0) %>%
  group_by(date) %>%
  summarise(total = sum(steps), avg = mean(steps))

# Distribution of the daily totals.
hist(data.per.date$total, main = "Total number of steps taken each day")

# Mean and median of the daily totals.
date.mean <- mean(data.per.date[["total"]])
date.median <- median(data.per.date[["total"]])
print(date.mean)
## [1] 10766.19
print(date.median)
## [1] 10765
# Per-interval total and mean step counts, aggregated over all days.
# Rows with a missing step count are dropped by the `steps >= 0` filter.
data.per.interval <- data.df.tbl %>%
  filter(steps >= 0) %>%
  group_by(interval) %>%
  summarize(total = sum(steps), avg = mean(steps))

# Line plot of the average number of steps against the 5-minute interval.
# (Axis label spelling fixed: "stpes" -> "steps".)
with(
  data.per.interval,
  plot(
    x = interval, y = avg, type = "l",
    xlab = "Interval", ylab = "Average number of steps"
  )
)

# Row describing the interval with the highest average step count.
avg.steps.max <- data.per.interval[which.max(data.per.interval$avg), ]
print(avg.steps.max)
## Source: local data frame [1 x 3]
##
## interval total avg
## 1 835 10927 206.1698
The average number of steps reaches its maximum, 206.1698113, in the 5-minute interval labelled 835.
# Count the missing values in each column of the raw data.
# The count for `steps` is kept in a variable for later use; the bare
# expressions are auto-printed.
steps.na.count <- sum(is.na(data.df[["steps"]]))
steps.na.count
## [1] 2304
sum(is.na(data.df[["date"]]))
## [1] 0
sum(is.na(data.df[["interval"]]))
## [1] 0
So there are no missing values in the date and interval variables. The total number of missing values in steps is 2304.
I will fill in each missing value in the dataset with the mean number of steps for its 5-minute interval.
# Impute missing step counts with the mean of the matching 5-minute interval.
# Work on a copy so the original table stays untouched.
data.df.tbl2 <- data.df.tbl
steps.na <- is.na(data.df.tbl2$steps)

# Look up each missing row's interval in the per-interval summary with match(),
# so every NA receives the average of its own interval regardless of row order
# or of how the missing rows are distributed. The previous `==` comparison
# relied on silent recycling of the 288 interval values against all missing
# rows. Assigning through `$steps` lets the integer column be promoted to
# double for the fractional means.
data.df.tbl2$steps[steps.na] <- data.per.interval$avg[
  match(data.df.tbl2$interval[steps.na], data.per.interval$interval)
]

# Per-day total and mean step counts on the imputed data.
data.per.date2 <- data.df.tbl2 %>%
  filter(steps >= 0) %>%
  group_by(date) %>%
  summarize(total = sum(steps), avg = mean(steps))

# Distribution of the daily totals after imputation.
hist(data.per.date2$total, main = "Total number of steps with imputing NA")

# Mean and median of the daily totals after imputation.
date2.mean <- mean(data.per.date2$total)
date2.median <- median(data.per.date2$total)
print(date2.mean)
## [1] 10766.19
print(date2.median)
## [1] 10766.19
# Label each observation with its day name and a weekday/weekend flag.
# `week_day` keeps the (locale-dependent) name returned by weekdays().
data.df.tbl2$week_day <- weekdays(as.Date(data.df.tbl2$date))

# Classify by the numeric day of week instead of the localized day name, so the
# result does not depend on the session locale. POSIXlt `wday` runs 0-6 with
# 0 = Sunday and 6 = Saturday. local() keeps the helper variable out of the
# global environment.
data.df.tbl2$week_type <- local({
  wday <- as.POSIXlt(as.Date(data.df.tbl2$date))$wday
  ifelse(wday == 0L | wday == 6L, "weekend", "weekday")
})

# Per-interval total and mean step counts, split by weekday/weekend.
data.per.weektype <- data.df.tbl2 %>%
  filter(steps >= 0) %>%
  group_by(week_type, interval) %>%
  summarize(total = sum(steps), avg = mean(steps))

# Panel plot (one panel per week type, stacked vertically) of the average
# number of steps per interval.
library(lattice)
xyplot(
  avg ~ interval | week_type,
  type = "l", xlab = "Interval", ylab = "Average number of steps",
  data = data.per.weektype, layout = c(1, 2)
)