# Print the R version, platform, locale and attached/loaded package versions
# used to produce this report.
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
##
## Matrix products: default
##
## locale:
## [1] LC_COLLATE=English_United States.1252
## [2] LC_CTYPE=English_United States.1252
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.1252
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.25
##
## loaded via a namespace (and not attached):
## [1] compiler_3.6.1 magrittr_1.5 tools_3.6.1 htmltools_0.4.0
## [5] yaml_2.2.0 Rcpp_1.0.2 stringi_1.4.3 rmarkdown_2.1
## [9] stringr_1.4.0 xfun_0.10 digest_0.6.22 rlang_0.4.5
## [13] evaluate_0.14
#packages
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# load data
# Download the zipped activity data into the working directory, extract it
# there and read the CSV it contains into `data`.
zip.url <- "https://github.com/coolhandluke999/RepData_PeerAssessment1/raw/master/activity.zip"
dir <- getwd()
zip.file <- "activity.zip"
# file.path() already returns a character path, so no paste()/as.character()
# round trip is needed.
zip.combine <- file.path(dir, zip.file)
# mode = "wb" requests a binary transfer explicitly so the archive cannot be
# damaged by line-ending translation on Windows.
download.file(zip.url, destfile = zip.combine, mode = "wb")
# Extract the file that was just downloaded, next to where it was saved,
# rather than relying on a relative name resolving to the same place.
unzip(zip.combine, exdir = dir)
data <- read.csv(file.path(dir, "activity.csv"))
# clean data
# Turn the date column (a factor after reading the CSV) into Date values by
# going through its character representation.
data[["date"]] <- as.Date(as.character(data[["date"]]))
# calculate total number of steps per day
# sum() is called without na.rm, so a day that contains missing step counts
# gets an NA total rather than a partial sum.
sumSteps <- data %>%
  group_by(date) %>%
  summarize(totalSteps = sum(steps))
# One bar per day, with the bar height taken directly from totalSteps.
# geom_col() is the layer meant for pre-computed bar heights;
# geom_histogram(stat = "identity") draws the same bars but warns that its
# binning parameters are ignored.
ggplot(data = sumSteps, mapping = aes(x = date, y = totalSteps)) +
  geom_col(color = "white", fill = "steelblue") +
  labs(title = "Total Steps Per Day", x = "Date", y = "Total Steps")
# calculate and report the mean and median of the total number of steps taken per day
# Days whose total is NA are excluded from both statistics (na.rm = TRUE).
# NOTE(review): the name `summary` masks base::summary() for the rest of the
# script; it is kept because later code refers to it.
summary <- summarize(
  sumSteps,
  mean = mean(totalSteps, na.rm = TRUE),
  median = median(totalSteps, na.rm = TRUE)
)
The mean and median of the total number of steps taken per day:
# Convert the one-row mean/median summary to an xtable object and emit it as
# HTML markup.
library(xtable)
x <- xtable(x = summary)
print(x = x, type = "html")
| | mean | median |
|---|---|---|
| 1 | 10766.19 | 10765 |
# average daily activity pattern
# Mean number of steps for each interval identifier, taken over all days and
# ignoring missing readings.
avgStepsInt <- summarize(
  group_by(data, interval),
  meanSteps = mean(steps, na.rm = TRUE)
)
# Line plot of the mean step count against the interval identifier.
ggplot(avgStepsInt, aes(x = interval, y = meanSteps)) +
  geom_line()
# identify interval with highest steps
# Row position of the largest mean step count.
which.max(avgStepsInt$meanSteps)
## [1] 104
# Look the interval up from the computed position instead of a hard-coded row
# number, so the answer stays correct if the data changes.
avgStepsInt$interval[which.max(avgStepsInt$meanSteps)]
## [1] 835
Interval 835 has the highest average number of steps across all days, at about 206 steps.
Missing (NA) values in the dataset are replaced here with imputed values. The imputed values are the mean number of steps for each interval.
Where a missing step value exists it is replaced by the calculated mean number of steps for the interval that the missing value is associated with in the dataset.
# Total number of missing values in the dataset
sum(is.na(data))
## [1] 2304
# add interval step means column to every record in dataset
# Each row's mean is looked up by its interval value. Assigning the
# per-interval vector directly would rely on silent recycling, which only
# lines up when every day lists the same intervals in the same order.
data$stepsAvg <- avgStepsInt$meanSteps[match(data$interval, avgStepsInt$interval)]
# assign mean interval value to any row with steps = NA
# The NA mask is computed once and reused on both sides of the assignment.
missingSteps <- is.na(data$steps)
data$steps[missingSteps] <- data$stepsAvg[missingSteps]
# Both checks below confirm that no missing values remain.
sum(is.na(data))
## [1] 0
which(is.na(data$steps))
## integer(0)
# create new data set equal to data minus the added interval mean column
# Keeps the first three columns of `data`, dropping the helper column that was
# appended after them.
newData <- data[, 1:3]
# Per-day mean and median of the (now imputed) per-interval step counts.
stepsMeanDay <- newData %>%
  group_by(date) %>%
  summarize(meanValue = mean(steps), medianValue = median(steps))
# geom_col() stacks the rows that share a date, so each bar's height is that
# day's total steps. It replaces geom_histogram(stat = "identity"), which
# draws the same bars but warns that its binning parameters are ignored.
ggplot(data = newData, mapping = aes(x = date, y = steps)) +
  geom_col(fill = "steelblue") +
  labs(title = "Total Steps Per Day", x = "Date", y = "Total Steps")
#steps increase dramatically when replacing NAs with interval means
# create factor variable for weekday and weekend types for each day
# NOTE(review): weekdays() returns day names in the current locale, so the
# English names below assume an English LC_TIME setting.
newData$dayOfWeek <- as.character(weekdays(newData$date))
weekday <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")
# %in% and ifelse() are vectorised, so the whole column is classified in one
# call. The previous element-wise sapply() assigned into newData inside the
# closure, which copied the data frame for every row and had no effect outside
# the function.
newData$dayType <- ifelse(newData$dayOfWeek %in% weekday, "Weekday", "Weekend")
# Explicit levels keep both categories defined even if one is absent.
newData$dayType <- factor(newData$dayType, levels = c("Weekday", "Weekend"))
# Mean number of steps for every interval, computed separately for weekdays
# and weekends.
stepsMeanDayType <- newData %>%
  group_by(dayType, interval) %>%
  summarize(meanValue = mean(steps))
# One panel per day type, stacked vertically so the two profiles share an
# x axis. The labels say "Minute" rather than "Second" because the interval
# identifiers advance in five-minute clock steps (the 104th interval is 835,
# i.e. 08:35).
ggplot(data = stepsMeanDayType, aes(x = interval, y = meanValue)) +
  geom_line(color = "steelblue", size = 1) +
  labs(title = "Average Number of Steps By Five Minute Interval",
       y = "Average Number of Steps",
       x = "Five Minute Interval") +
  facet_wrap(~ dayType, ncol = 1)