# Set the working directory to the folder that contains `1Data/` before running.
# Load required R packages.
library(lubridate)
library(stringr)
library(dplyr)
library(ggplot2)
library(lattice)
library(knitr)
# Read the raw activity data; the literal string "NA" marks a missing value.
activity <- read.csv(
  file = "1Data/activity.csv",
  header = TRUE,
  na.strings = "NA"
)

# `interval` holds the time of day as an integer (e.g. 5, 45, 2355).
# Left-pad it with zeros to four characters so it can be parsed as
# hours and minutes.
time0 <- c(activity$interval)
time1 <- str_pad(string = time0, width = 4, pad = "0")

# Glue date and padded time together and parse the result into a timestamp.
activity$datetime <- ymd_hm(paste(activity$date, time1, sep = " "))

# Show the resulting column types.
str(activity)
## 'data.frame': 17568 obs. of 4 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
## $ datetime: POSIXct, format: "2012-10-01 00:00:00" "2012-10-01 00:05:00" ...
# Per-column summary of the data (printed). The NA count of `steps` is part
# of the output; summary() takes no `na.rm` argument, so none is passed.
summary(activity)
## steps date interval
## Min. : 0.00 2012-10-01: 288 Min. : 0.0
## 1st Qu.: 0.00 2012-10-02: 288 1st Qu.: 588.8
## Median : 0.00 2012-10-03: 288 Median :1177.5
## Mean : 37.38 2012-10-04: 288 Mean :1177.5
## 3rd Qu.: 12.00 2012-10-05: 288 3rd Qu.:1766.2
## Max. :806.00 2012-10-06: 288 Max. :2355.0
## NA's :2304 (Other) :15840
## datetime
## Min. :2012-10-01 00:00:00
## 1st Qu.:2012-10-16 05:58:45
## Median :2012-10-31 11:57:30
## Mean :2012-10-31 11:57:30
## 3rd Qu.:2012-11-15 17:56:15
## Max. :2012-11-30 23:55:00
##
# Replace the raw date column with a Date derived from the parsed timestamp.
activity$date <- as.Date(activity$datetime)

# One row per day:
#   sum.steps - total recorded steps (missing readings are skipped, so a day
#               with no recorded data totals 0)
#   na        - share of that day's readings that are missing
total.steps <- summarise(
  group_by(activity, date),
  sum.steps = sum(steps, na.rm = TRUE),
  na = mean(is.na(steps))
)
# Bar chart of the total number of steps taken on each day.
# The chart is drawn from the per-day totals in `total.steps`: the bar
# heights equal the stacked raw readings, but ggplot2 does not have to stack
# every single reading or discard the rows whose `steps` is NA.
# The plot object gets a descriptive name; a single-letter name such as `c`
# would mask base::c().
daily_steps_plot <- ggplot(total.steps, aes(date, sum.steps)) +
  geom_bar(stat = "identity", colour = "blue", fill = "blue", width = 0.7) +
  labs(
    title = "Histogram of Total Number of Steps Taken Each Day",
    x = "Date",
    y = "Total number of steps"
  )
print(daily_steps_plot)
# Mean and median of the per-day step totals, truncated to whole steps.
Mean.Steps <- with(total.steps, as.integer(mean(sum.steps)))
Median.Steps <- with(total.steps, as.integer(median(sum.steps)))
# Average number of steps for each interval across all days
# (missing readings are ignored).
int.mean <- activity %>%
  group_by(interval) %>%
  summarise(StepCount = mean(steps, na.rm = TRUE))

# Line plot of the daily activity pattern. Points are placed by row index
# and the default x axis is suppressed so it can be labelled with the
# interval values below.
n_intervals <- nrow(int.mean)
plot(
  x = seq_len(n_intervals), y = int.mean$StepCount, type = "l",
  col = "blue", xaxt = "n", xlab = "Interval (24h Time of Day)",
  ylab = "Mean Steps across All Days", main = "Mean Steps per Time Interval"
)

# Label every 12th interval. Tick positions are derived from the number of
# rows actually present instead of assuming a fixed count of intervals.
tick_idx <- seq(1L, n_intervals, by = 12L)
axis(1, labels = int.mean$interval[tick_idx], at = tick_idx)
# Interval(s) with the highest mean step count.
max.steps <- int.mean %>%
  filter(StepCount == max(StepCount))

# Missing `steps` readings: absolute count and share of all rows.
na.count <- sum(is.na(activity[["steps"]]))
na.pct <- mean(is.na(activity[["steps"]]))

# Number of missing cells across every column of the data (printed).
sum(is.na(activity))
## [1] 2304
# Impute each missing `steps` value with the mean of the same interval.
naActivity <- is.na(activity$steps) # rows whose step count is missing
actsplt <- split(activity, as.factor(activity$interval)) # one piece per interval

# Mean steps per interval, named by interval value. vapply() guarantees a
# numeric vector regardless of what the input looks like.
mean_int <- vapply(
  actsplt,
  function(x) mean(x$steps, na.rm = TRUE),
  numeric(1)
)
nday <- length(levels(as.factor(activity$date))) # count of days in dataset

# Look up the replacement for every missing row by that row's own interval.
# This does not depend on the rows being sorted by date and interval, nor on
# every day containing the full set of intervals.
act_fill <- activity
fill_keys <- as.character(activity$interval[naActivity])
act_fill$steps[naActivity] <- unname(mean_int[fill_keys])

# Show the first affected rows before and after imputation (printed).
cbind(head(activity[naActivity, ]), head(act_fill[naActivity, ]))
## steps date interval datetime steps date
## 1 NA 2012-10-01 0 2012-10-01 00:00:00 1.7169811 2012-10-01
## 2 NA 2012-10-01 5 2012-10-01 00:05:00 0.3396226 2012-10-01
## 3 NA 2012-10-01 10 2012-10-01 00:10:00 0.1320755 2012-10-01
## 4 NA 2012-10-01 15 2012-10-01 00:15:00 0.1509434 2012-10-01
## 5 NA 2012-10-01 20 2012-10-01 00:20:00 0.0754717 2012-10-01
## 6 NA 2012-10-01 25 2012-10-01 00:25:00 2.0943396 2012-10-01
## interval datetime
## 1 0 2012-10-01 00:00:00
## 2 5 2012-10-01 00:05:00
## 3 10 2012-10-01 00:10:00
## 4 15 2012-10-01 00:15:00
## 5 20 2012-10-01 00:20:00
## 6 25 2012-10-01 00:25:00
sum(is.na(act_fill))
## [1] 0
# Bar chart of the total number of steps per day after imputation.
# The readings are summed per day first so that ggplot2 draws one bar per
# date instead of stacking every single reading; the bar heights are the
# same. The plot object gets a descriptive name; a single-letter name such
# as `c` would mask base::c().
fill_daily <- act_fill %>%
  group_by(date) %>%
  summarise(steps = sum(steps, na.rm = TRUE))
filled_steps_plot <- ggplot(fill_daily, aes(date, steps)) +
  geom_bar(stat = "identity", colour = "blue", fill = "blue", width = 0.7) +
  labs(
    title = "Histogram of Total Number of Steps Taken Each Day (with NA's replaced by mean per 5-minute interval)",
    x = "Date",
    y = "Total number of steps"
  )
print(filled_steps_plot)
# Mean before and after imputation
# Mean of the per-day step totals (printed). `before` skips missing readings
# when summing a day; `after` sums the imputed data as-is.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
c(
  before = mean(tapply(
    activity$steps, as.factor(activity$date),
    function(x) sum(x, na.rm = TRUE)
  )),
  after = mean(tapply(act_fill$steps, as.factor(act_fill$date), sum))
)
## before after
## 9354.23 10766.19
# Median before and after imputation
# Median of the per-day step totals (printed). `before` skips missing
# readings when summing a day; `after` sums the imputed data as-is.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
c(
  before = median(tapply(
    activity$steps, as.factor(activity$date),
    function(x) sum(x, na.rm = TRUE)
  )),
  after = median(tapply(act_fill$steps, as.factor(act_fill$date), sum))
)
## before after
## 10395.00 10766.19
activity$date <- as.Date(activity$date)

# Classify every row as "weekday" or "weekend" from the numeric day of the
# week (POSIXlt `wday`: 0 = Sunday, 1 = Monday, ..., 6 = Saturday).
# weekdays() returns day names in the current locale, so matching them
# against English names would label every day "weekend" elsewhere.
weekday_nums <- 1:5 # Monday to Friday
is_weekday <- as.POSIXlt(activity$date)$wday %in% weekday_nums
activity$wday <- factor(
  is_weekday,
  levels = c(FALSE, TRUE),
  labels = c("weekend", "weekday")
) # two-level factor variable for weekend/weekday
str(activity$wday) # display structure of wday
## Factor w/ 2 levels "weekend","weekday": 2 2 2 2 2 2 2 2 2 2 ...
summary(activity$wday) # display summary of wday
## weekend weekday
## 4608 12960
# Mean steps for every combination of interval and day type, computed from
# `activity`. The result replaces the previous contents of `act_fill`.
act_fill <- setNames(
  aggregate(steps ~ interval + wday, data = activity, FUN = mean),
  c("interval", "wday", "steps")
)

# Two side-by-side lattice panels (one per day type) showing the mean
# activity pattern over the day as a line.
xyplot(
  steps ~ interval | factor(wday),
  data = act_fill,
  type = "l",
  layout = c(2, 1),
  main = "Mean Steps per Interval - Weekend vs. Weekday",
  xlab = "5-minute Interval",
  ylab = "Mean Steps"
)
#knit2html()