## Extract the contents of activity.zip, then load activity.csv into a data frame
unzip("activity.zip")
datActivity <- read.csv("activity.csv")
## Parse the date column from year-month-day text into Date values
datActivity[["date"]] <- as.Date(datActivity[["date"]], format = "%Y-%m-%d")
## Show the structure of the loaded dataset
str(datActivity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Date, format: "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
## load libraries utilized in the analysis
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Total steps for each date. Missing step counts are dropped before summing,
## so a date whose readings are all NA ends up with a total of 0.
stepsbyday <- datActivity %>%
  group_by(date) %>%
  summarize(total = sum(steps, na.rm = TRUE))
## Plot the daily totals as a bar chart with one bar per date
## (this shows totals over time; it is not a frequency histogram)
barplot(
  total ~ date,
  data = stepsbyday,
  axis.lty = 1,
  xlab = "Date",
  ylab = "Steps per Day"
)
## Report the mean (rounded) and the median of the daily totals
paste(
  "Mean total number of steps per day:",
  round(mean(stepsbyday[["total"]]))
)
## [1] "Mean total number of steps per day: 9354"
paste(
  "Median total number of steps per day:",
  median(stepsbyday[["total"]])
)
## [1] "Median total number of steps per day: 10395"
Step activity is concentrated in the morning, peaking at interval 835 with an average of about 206 steps.
## Average number of steps for each 5-minute interval across all dates,
## ignoring missing values
stepsbytime <- datActivity %>%
  group_by(interval) %>%
  summarize(avg = mean(steps, na.rm = TRUE))
## Round the interval averages to whole steps
stepsbytime <- transform(stepsbytime, avg = round(avg))
## Line plot of the average steps against the interval identifier
plot(
  x = stepsbytime$interval,
  y = stepsbytime$avg,
  type = "l",
  xlab = "Interval",
  ylab = "Average Steps"
)
## Pick the row holding the largest average and report its interval and value
maxVal <- stepsbytime[which.max(stepsbytime$avg), ]
paste(
  "The interval with the maximum average number of steps:",
  maxVal$interval
)
## [1] "The interval with the maximum average number of steps: 835"
paste(
  "The maximum average number of steps in an interval:",
  maxVal$avg
)
## [1] "The maximum average number of steps in an interval: 206"
## Count and report the rows whose step value is missing
paste(
  "The number of rows with missing values for steps:",
  sum(is.na(datActivity$steps))
)
## [1] "The number of rows with missing values for steps: 2304"
## Impute missing step values on a copy of the dataset: every NA is replaced
## by the (rounded) average for the same time interval taken from stepsbytime.
## The lookup is vectorised with match() rather than looping over rows, which
## avoids rescanning stepsbytime for each row, refers to the average by column
## name instead of position, and is safe when the data frame has zero rows.
imputedActivity <- datActivity
missingSteps <- is.na(imputedActivity$steps)
imputedActivity$steps[missingSteps] <- stepsbytime$avg[
  match(imputedActivity$interval[missingSteps], stepsbytime$interval)
]
The overall shape of the histogram is unchanged, except that fewer days show no recorded steps. The mean increased substantially (from 9354 to 10766) and is now close to the median (10762).
## Total steps for each date, computed from the imputed dataset
imputedstepsbyday <- imputedActivity %>%
  group_by(date) %>%
  summarize(total = sum(steps, na.rm = TRUE))
## Plot the imputed daily totals as a bar chart with one bar per date
## (this shows totals over time; it is not a frequency histogram)
barplot(
  total ~ date,
  data = imputedstepsbyday,
  axis.lty = 1,
  xlab = "Date",
  ylab = "Steps per Day"
)
## Report the mean (rounded) and the median of the imputed daily totals
paste(
  "Mean total number of steps per day:",
  round(mean(imputedstepsbyday[["total"]]))
)
## [1] "Mean total number of steps per day: 10766"
paste(
  "Median total number of steps per day:",
  median(imputedstepsbyday[["total"]])
)
## [1] "Median total number of steps per day: 10762"
On weekdays the step activity is predominantly in the morning, whereas on weekends it is more uniform throughout the day.
## Add a weekday/weekend factor to the imputed dataset.
## The day of the week is taken from the numeric POSIXlt field wday
## (0 = Sunday ... 6 = Saturday) instead of comparing weekdays() names with
## "Sat"/"Sun": day names are locale-specific, so the string comparison would
## label every row "weekday" under a non-English locale.
isWeekend <- as.POSIXlt(imputedActivity$date)$wday %in% c(0L, 6L)
imputedActivity$day <- as.factor(ifelse(isWeekend, "weekend", "weekday"))
## Average steps per time interval, computed separately for weekdays and
## weekends; .groups = "drop" returns an ungrouped result
wdaystepsbytime <- imputedActivity %>%
  group_by(day, interval) %>%
  summarize(avg = mean(steps, na.rm = TRUE), .groups = "drop")
## Round the interval averages to whole steps
wdaystepsbytime <- transform(wdaystepsbytime, avg = round(avg))
## Time-series panels of average steps per interval, one row per day type
ggplot(wdaystepsbytime, aes(interval, avg)) +
  geom_line() +
  facet_grid(day ~ .)