## #############################################################################
# Reproducible Research: Peer Assessment 1 / PA1_template.Rmd
#
# Gabriel Ortiz / gabeortiz@icloud.com
#
# Completes course project objectives by answering questions.
#
## #############################################################################
## Purpose
# Switches to a working directory, makes sure a "./data" folder exists in it,
# optionally downloads the data file and/or extracts a zip archive into
# "./data", then reads "./data/<fn>" and returns it.
#
## Args
# wd:  working directory to switch to; the change persists after the call
# lf:  local file? when FALSE the file is first downloaded from `fu`
# fu:  file url, only used when `lf` is FALSE
# fn:  file name (inside "./data") of the file to read
# it:  import type; only "csv" is supported
# uf:  unzip file? when TRUE `zf` is extracted into "./data"
# zf:  zip filename, only used (and only required) when `uf` is TRUE
# ...: accepted but currently unused
#
## Return
# data.frame
load.data <- function(wd, lf = TRUE, fu, fn, it = "csv", uf = FALSE, zf, ...) {
  # reject unsupported import types up front instead of failing later with
  # an obscure "object not found" error
  if (!isTRUE(it == "csv")) {
    stop("unsupported import type: ", it, call. = FALSE)
  }
  # set working directory (intentionally not restored: callers may rely on it)
  setwd(wd)
  # check if data folder exists and create it if not
  if (!file.exists("./data")) {
    dir.create("./data")
  }
  # local path of the file that will be read
  data_path <- file.path("./data", fn)
  # download the file only when it is not already available locally
  if (!lf) {
    download.file(fu, destfile = data_path)
  }
  # unzip the archive only when asked to; `zf` is not needed otherwise
  if (uf) {
    if (missing(zf)) {
      stop("`zf` must be supplied when `uf` is TRUE", call. = FALSE)
    }
    unzip(zf, exdir = "./data")
  }
  # read the data using the indicated import type and return the data.frame
  read.csv(data_path)
}
## read the course data set; every argument is named so the mapping onto
## load.data()'s parameters is explicit
activity <- load.data(
  wd = getwd(),
  lf = TRUE,
  fu = "N/A",
  fn = "activity.csv",
  it = "csv",
  uf = TRUE,
  zf = "activity.zip"
)
## column names
names(activity)
## [1] "steps" "date" "interval"
## structure of the data frame
str(activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
## first ten rows
head(activity, n = 10)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
## 7 NA 2012-10-01 30
## 8 NA 2012-10-01 35
## 9 NA 2012-10-01 40
## 10 NA 2012-10-01 45
## keep only the rows with no missing value in any column, for later use
without_na <- activity[complete.cases(activity), , drop = FALSE]
For this part of the assignment, you can ignore the missing values in the dataset.
## total number of steps taken per day (rows with missing values excluded),
## with the summed column given a descriptive name
total <- setNames(
  aggregate(steps ~ date, data = without_na, FUN = sum),
  c("date", "sum_steps")
)
## check out new data frame
head(total, n = 5)
## date sum_steps
## 1 2012-10-02 126
## 2 2012-10-03 11352
## 3 2012-10-04 12116
## 4 2012-10-05 13294
## 5 2012-10-06 15420
## histogram of the daily totals; breaks = 20 is purely for better visuals
hist(
  x = total$sum_steps,
  breaks = 20,
  main = "Histogram of the (Total Number of (Steps Taken Each Day))",
  xlab = "(Total Number of (Steps Taken Each Day))",
  col = "blue"
)
## mean of the daily totals
mean(total$sum_steps)
## [1] 10766
## median of the daily totals
median(total$sum_steps)
## [1] 10765
## average number of steps per 5-minute interval, averaged across all days,
## with the averaged column given a descriptive name
interval <- setNames(
  aggregate(steps ~ interval, data = without_na, FUN = mean),
  c("interval", "mean_steps")
)
## check out new data frame
head(interval, n = 5)
## interval mean_steps
## 1 0 1.71698
## 2 5 0.33962
## 3 10 0.13208
## 4 15 0.15094
## 5 20 0.07547
## widen the plot margins (in inches) to accommodate long text labels
par(mai = c(1.2, 1.5, 1, 1))
## time series of the interval averages
plot(
  interval$interval,
  interval$mean_steps,
  type = "l",
  xlab = "5-Minute Interval",
  ylab = "Average Number of Steps Taken,\n Averaged Across All Days",
  main = "Time Series Plot of the 5-Minute Interval\n and the Average Number of Steps Taken, Averaged Across All Days"
)
## interval(s) with the highest average number of steps
subset(interval, mean_steps == max(mean_steps))
## interval mean_steps
## 104 835 206.2
## number of rows whose step count is missing
sum(is.na(activity$steps))
## [1] 2304
I will use the mean for each 5-minute interval to populate the NA values in that interval.
## attach each interval's average to every row of the original data
## (merge() returns the rows ordered by interval, not by date)
newactivity <- merge(activity, interval, by = "interval", all.y = FALSE)
## replace missing step counts with the interval average, rounded to the
## nearest integer so the column stays integer
newactivity$steps <- ifelse(
  is.na(newactivity$steps),
  as.integer(round(newactivity$mean_steps)),
  newactivity$steps
)
## drop and reorder columns to match original activity data frame
## http://stackoverflow.com/questions/4605206/drop-columns-r-data-frame
keeps <- names(activity)
newactivity <- newactivity[keeps]
## total number of steps taken per day on the imputed data, with the summed
## column given a descriptive name
newtotal <- setNames(
  aggregate(steps ~ date, data = newactivity, FUN = sum),
  c("date", "sum_steps")
)
## check out new data frame
head(newtotal, n = 5)
## date sum_steps
## 1 2012-10-01 10762
## 2 2012-10-02 126
## 3 2012-10-03 11352
## 4 2012-10-04 12116
## 5 2012-10-05 13294
## histogram of the imputed daily totals; breaks = 20 is purely for visuals
hist(
  x = newtotal$sum_steps,
  breaks = 20,
  main = "Histogram of the (Total Number of (Steps Taken Each Day))\nPart Deux",
  xlab = "(Total Number of (Steps Taken Each Day)) Part Deux",
  col = "blue"
)
## mean of the imputed daily totals
mean(newtotal$sum_steps)
## [1] 10766
## median of the imputed daily totals
median(newtotal$sum_steps)
## [1] 10762
They do differ, but ever so slightly.
mean(total) = 10766.19, while mean(newtotal) = 10765.64. Rounding produces the same value.
median(total) = 10765, while median(newtotal) = 10762. 3 step difference.
This seems to depend heavily on how you impute the missing data. Since I used the average for a given interval, there was practically no difference: the imputed values are themselves averages, so they pull the results toward the existing average rather than away from it.
## create new data frame
newnewactivity <- newactivity
## set up logical/test vector marking Saturdays and Sundays.
## weekdays() returns day names in the current locale, so comparing them with
## English names silently labels every row "weekday" under a non-English
## locale; the numeric POSIXlt wday field (0 = Sunday ... 6 = Saturday) does
## not depend on the locale.
## https://class.coursera.org/repdata-004/forum/thread?thread_id=34#post-472
weekend <- as.POSIXlt(as.Date(newnewactivity$date))$wday %in% c(0L, 6L)
## Fill in weekday column
## https://class.coursera.org/repdata-004/forum/thread?thread_id=34#post-125
newnewactivity$daytype <- "weekday"
## replace "weekday" with "weekend" where day == Sat/Sun
## https://class.coursera.org/repdata-004/forum/thread?thread_id=34#post-472
newnewactivity$daytype[weekend] <- "weekend"
## convert new character column to factor
newnewactivity$daytype <- as.factor(newnewactivity$daytype)
## Check out new data frame
str(newnewactivity)
## 'data.frame': 17568 obs. of 4 variables:
## $ steps : int 2 0 0 0 0 0 0 0 0 0 ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 54 28 37 55 46 20 47 38 56 ...
## $ interval: int 0 0 0 0 0 0 0 0 0 0 ...
## $ daytype : Factor w/ 2 levels "weekday","weekend": 1 1 2 1 2 1 2 1 1 2 ...
head(newnewactivity, 5)
## steps date interval daytype
## 1 2 2012-10-01 0 weekday
## 2 0 2012-11-23 0 weekday
## 3 0 2012-10-28 0 weekend
## 4 0 2012-11-06 0 weekday
## 5 0 2012-11-24 0 weekend
## double check (display only; the printed day name follows the session locale)
## https://class.coursera.org/repdata-004/forum/thread?thread_id=34#post-472
weekdays(as.Date(newnewactivity$date[3]))
## [1] "Sunday"
## the average number of steps taken, averaged across all days for each 5-minute
## interval, computed separately for weekdays and weekends
newinterval <- aggregate(steps ~ interval + daytype, newnewactivity, mean)
## add descriptive variable names
names(newinterval)[3] <- "mean_steps"
## check out new data frame
head(newinterval, 5)
## interval daytype mean_steps
## 1 0 weekday 2.28889
## 2 5 weekday 0.40000
## 3 10 weekday 0.15556
## 4 15 weekday 0.17778
## 5 20 weekday 0.08889
## plot time series, one panel per day type stacked vertically
library(lattice)
xyplot(
  mean_steps ~ interval | daytype,
  newinterval,
  type = "l",
  layout = c(1, 2),
  main = "Time Series Plot of the 5-Minute Interval\nand the Average Number of Steps Taken,\nAveraged Across All Weekday Days or Weekend Days",
  xlab = "5-Minute Interval",
  ylab = "Average Number of Steps Taken"
)