#Reproducible Research: Week 2 Assignment
Before begininning, I set the working directory to the appropriate folder.
setwd("./rexercise/")
dir.create("reproducible")
## Warning in dir.create("reproducible"): 'reproducible' already exists
setwd("./reproducible/")
Install and load the relevant libraries.
install.packages("ggplot2")
## Installing package into '/home/henryskas/R/x86_64-pc-linux-gnu-library/4.0'
## (as 'lib' is unspecified)
library(ggplot2)
install.packages("data.table")
## Installing package into '/home/henryskas/R/x86_64-pc-linux-gnu-library/4.0'
## (as 'lib' is unspecified)
library("data.table")
install.packages("dplyr")
## Installing package into '/home/henryskas/R/x86_64-pc-linux-gnu-library/4.0'
## (as 'lib' is unspecified)
library("dplyr")
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
download.file(fileUrl, destfile = paste0(getwd(), '/repdata%2Fdata%2Factivity.zip'), method = "curl")
unzip("repdata%2Fdata%2Factivity.zip")
activityBT <- data.table::fread(input = "activity.csv")
activityDT <- na.omit(activityBT)
activityday <- group_by(activityDT, date)
activityday <- summarize(activityday, steps=sum(steps))
## `summarise()` ungrouping output (override with `.groups` argument)
summary(activityday)
## date steps
## Min. :2012-10-02 Min. : 41
## 1st Qu.:2012-10-16 1st Qu.: 8841
## Median :2012-10-29 Median :10765
## Mean :2012-10-30 Mean :10766
## 3rd Qu.:2012-11-16 3rd Qu.:13294
## Max. :2012-11-29 Max. :21194
ggplot(activityday, aes(x = steps)) +
geom_histogram(fill = "blue", binwidth = 1000) +
labs(title = "Daily Steps", x = "Steps", y = "Frequency")
3. Calculate and report the mean and median of the total number of steps taken per day
mean(activityday$steps)
## [1] 10766.19
median(activityday$steps)
## [1] 10765
activityintval <- group_by(activityDT, interval)
activityintval <- summarize(activityintval, steps=mean(steps))
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(activityintval, aes(x = interval , y = steps)) + geom_line(color="blue", size=1) + labs(title = "Avg. Daily Steps", x = "Interval", y = "Avg. Steps per day")
2. Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?
df_data <- data.frame(interval=as.integer(names(activityintval)), avg=activityintval)
## Warning in data.frame(interval = as.integer(names(activityintval)), avg =
## activityintval): NAs introduced by coercion
max_steps <- max(df_data$avg.steps)
df_data[df_data$avg.steps == max_steps, ]
## interval avg.interval avg.steps
## 104 NA 835 206.1698
activityBT[is.na(steps), .N ]
## [1] 2304
# alternative solution
nrow(activityBT[is.na(steps),])
## [1] 2304
# Filling in missing values with median of dataset.
activityDT[is.na(steps), "steps"] <- activityDT[, c(lapply(.SD, median, na.rm = TRUE)), .SDcols = c("steps")]
# Filling in missing values with median of dataset.
activityBT[is.na(steps), "steps"] <- activityBT[, c(lapply(.SD, median, na.rm = TRUE)), .SDcols = c("steps")]
data.table::fwrite(x = activityBT, file = "tidyData.csv", quote = FALSE)
Total_Steps <- activityBT[, c(lapply(.SD, sum)), .SDcols = c("steps"), by = .(date)]
Total_Steps[, .(Mean_Steps = mean(steps), Median_Steps = median(steps))]
## Mean_Steps Median_Steps
## 1: 9354.23 10395
ggplot(Total_Steps, aes(x = steps)) + geom_histogram(fill = "blue", binwidth = 1000) + labs(title = "Daily Steps", x = "Steps", y = "Frequency")
Type of Estimate | Mean_Steps | Median_Steps — | — | — First Part (with na) | 10766.19 | 10765 Second Part (fillin in na with median) | 9354.23 | 10395
# Just recreating activityDT from scratch then making the new factor variable. (No need to, just want to be clear on what the entire process is.)
activityBT <- data.table::fread(input = "activity.csv")
activityBT[, date := as.POSIXct(date, format = "%Y-%m-%d")]
activityBT[, `Day of Week`:= weekdays(x = date)]
activityBT[grepl(pattern = "Monday|Tuesday|Wednesday|Thursday|Friday", x = `Day of Week`), "weekday or weekend"] <- "weekday"
activityBT[grepl(pattern = "Saturday|Sunday", x = `Day of Week`), "weekday or weekend"] <- "weekend"
activityBT[, `weekday or weekend` := as.factor(`weekday or weekend`)]
head(activityBT, 10)
## steps date interval Day of Week weekday or weekend
## 1: NA 2012-10-01 0 Monday weekday
## 2: NA 2012-10-01 5 Monday weekday
## 3: NA 2012-10-01 10 Monday weekday
## 4: NA 2012-10-01 15 Monday weekday
## 5: NA 2012-10-01 20 Monday weekday
## 6: NA 2012-10-01 25 Monday weekday
## 7: NA 2012-10-01 30 Monday weekday
## 8: NA 2012-10-01 35 Monday weekday
## 9: NA 2012-10-01 40 Monday weekday
## 10: NA 2012-10-01 45 Monday weekday
activityBT[is.na(steps), "steps"] <- activityBT[, c(lapply(.SD, median, na.rm = TRUE)), .SDcols = c("steps")]
IntervalBT <- activityBT[, c(lapply(.SD, mean, na.rm = TRUE)), .SDcols = c("steps"), by = .(interval, `weekday or weekend`)]
ggplot(IntervalBT , aes(x = interval , y = steps, color=`weekday or weekend`)) + geom_line() + labs(title = "Avg. Daily Steps by Weektype", x = "Interval", y = "No. of Steps") + facet_wrap(~`weekday or weekend` , ncol = 1, nrow=2)