#record the R version, platform, locale and package versions used to build this report
sessionInfo()
## R version 3.6.1 (2019-07-05)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19041)
## 
## Matrix products: default
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] knitr_1.25
## 
## loaded via a namespace (and not attached):
##  [1] compiler_3.6.1  magrittr_1.5    tools_3.6.1     htmltools_0.4.0
##  [5] yaml_2.2.0      Rcpp_1.0.2      stringi_1.4.3   rmarkdown_2.1  
##  [9] stringr_1.4.0   xfun_0.10       digest_0.6.22   rlang_0.4.5    
## [13] evaluate_0.14

Loading and preprocessing the data

#packages
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#load data

zip.url <- "https://github.com/coolhandluke999/RepData_PeerAssessment1/raw/master/activity.zip"

#build the archive path with file.path() rather than pasting separators by hand
dir <- getwd()
zip.file <- "activity.zip"
zip.combine <- file.path(dir, zip.file)

#the archive is binary, so request a binary transfer explicitly (matters on Windows)
download.file(zip.url, destfile = zip.combine, mode = "wb")

#extract from the same path the archive was downloaded to, into the same directory
unzip(zip.combine, exdir = dir)

data <- read.csv(file.path(dir, "activity.csv"))

#clean data
#transform date into a Date value
#as.character() makes this work whether read.csv returned a factor or a character column

data$date <- as.Date(as.character(data$date))

What is mean total number of steps taken per day?

#calculate total number of steps per day
#sum() is called without na.rm, so a day containing missing step counts gets an
#NA total rather than a misleading partial total

sumSteps <- data %>%
        group_by(date) %>%
        summarize(totalSteps = sum(steps))

#bar heights are the daily totals themselves, so use geom_col();
#geom_histogram(stat = "identity") draws the same bars but warns about ignored binning parameters
ggplot(data = sumSteps, mapping = aes(x = date, y = totalSteps)) +
        geom_col(color = "white", fill = "steelblue") +
        labs(title = "Total Steps Per Day", x = "Date", y = "Total Steps")

#calculate and report the mean and median of the total number of steps taken per day
#days whose total is NA are dropped from both statistics by na.rm = TRUE

summary <- summarize(
        sumSteps,
        mean = mean(totalSteps, na.rm = TRUE),
        median = median(totalSteps, na.rm = TRUE)
)

The mean and median total number of steps per day:

library(xtable)
#convert the mean/median summary into an xtable object and emit it as an HTML table
x <- summary %>% xtable()
print(x, type = "html")
mean median
1 10766.19 10765

What is the average daily activity pattern?

#average daily activity pattern
#mean number of steps for each interval across all days, ignoring missing step counts

avgStepsInt <- data %>%
        group_by(interval) %>%
        summarize(meanSteps = mean(steps, na.rm = TRUE))

#label the plot like the other figures in this report
ggplot(data = avgStepsInt, mapping = aes(x = interval, y = meanSteps)) +
        geom_line() +
        labs(title = "Average Number of Steps By Interval",
             x = "Interval",
             y = "Average Number of Steps")

#identify interval with highest steps
#keep the row index in a variable instead of hard-coding it, so the lookup
#stays correct if the underlying data changes

maxIdx <- which.max(avgStepsInt$meanSteps)
maxIdx
## [1] 104
avgStepsInt$interval[maxIdx]
## [1] 835

Interval 835, with an average of about 206 steps across all days, is the interval with the highest average number of steps.

Imputing missing values

Missing (NA) values in the dataset are replaced here with imputed values. The imputed values are the mean number of steps for the corresponding interval.

Where a missing step value exists it is replaced by the calculated mean number of steps for the interval that the missing value is associated with in the dataset.

#Total number of missing values in the dataset

sum(is.na(data))
## [1] 2304
#add interval step means column to every record in dataset
#look each row's mean up by its interval value instead of relying on the shorter
#vector being recycled, which is only correct when rows are sorted and every day is complete
data$stepsAvg <- avgStepsInt$meanSteps[match(data$interval, avgStepsInt$interval)]

#assign mean interval value to any row with steps = NA
missingSteps <- is.na(data$steps)
data$steps[missingSteps] <- data$stepsAvg[missingSteps]

sum(is.na(data))
## [1] 0
which(is.na(data$steps))
## integer(0)
#create new data set equal to data minus the added interval mean column
#drop the helper column by name so the result does not depend on column positions
newData <- data[, names(data) != "stepsAvg"]

#per-day mean and median of the interval step counts after imputation
stepsMeanDay <- newData %>%
        group_by(date) %>%
        summarize(meanValue = mean(steps),
                  medianValue = median(steps))

#one bar segment per interval row, stacked by date, so each bar's height is that day's total;
#geom_col() replaces geom_histogram(stat = "identity"), which draws the same bars
#but warns about ignored binning parameters
ggplot(data = newData, mapping = aes(x = date, y = steps)) +
        geom_col(fill = "steelblue") +
        labs(title = "Total Steps Per Day", x = "Date", y = "Total Steps")

#steps increase dramatically when replacing NAs with interval means

Are there differences in activity patterns between weekdays and weekends?

#create factor variable for weekday and weekend types for each day
#note: weekdays() returns locale-dependent names; the English names below match
#the English LC_TIME locale shown in the session info at the top of this report

newData$dayOfWeek <- weekdays(newData$date)
weekday <- c("Monday", "Tuesday", "Wednesday", "Thursday", "Friday")

#classify the whole column in one vectorised call; the previous per-row sapply()
#assigned into newData inside the callback, copying the data frame for every row
newData$dayType <- ifelse(newData$dayOfWeek %in% weekday, "Weekday", "Weekend")

newData$dayType <- factor(newData$dayType, levels = c("Weekday", "Weekend"))

#mean steps per interval, computed separately for weekdays and weekends
stepsMeanDayType <- newData %>%
        group_by(dayType, interval) %>%
        summarize(meanValue = mean(steps))

#the interval identifiers advance in five-minute increments (the 104th interval
#is 835, i.e. 08:35), so the labels say minutes rather than seconds
ggplot(data = stepsMeanDayType, aes(x = interval, y = meanValue)) +
        geom_line(color = "steelblue", size = 1) +
        labs(title = "Average Number of Steps By Five Minute Interval",
             y = "Average Number of Steps",
             x = "Five Minute Interval") +
        facet_wrap(~ dayType, ncol = 1)