Loading and preprocessing the data
library(dplyr)
library(ggplot2)
# Read the raw activity data (columns used below: steps, date, interval).
# NOTE(review): the absolute path is machine-specific; adjust it when running
# this analysis on another machine.
activity <- read.csv("C:/Users/Jeremiah Lowhorn/Desktop/activity.csv",
                     header = TRUE)
# Parse the date column from month/day/year text into Date objects.
activity$date <- as.Date(activity$date, format = "%m/%d/%Y")
# Rows that have a recorded step count.
activity.drop <- activity[!is.na(activity$steps), ]
# Rows whose step count is missing.
na.activity <- activity[is.na(activity$steps), ]
What is the mean total number of steps taken per day?
# Per-day step statistics, computed only from rows without missing values.
perDay <- na.omit(activity) %>%
  group_by(date) %>%
  arrange(date) %>%
  summarise(
    sum_of_steps    = sum(steps),
    mean_of_steps   = mean(steps),
    median_of_steps = median(steps)
  )
# Histogram of the daily step totals, using bins that are 1000 steps wide.
a <- ggplot(perDay, aes(x = sum_of_steps)) +
  geom_histogram(binwidth = 1000, fill = "darkred", color = "black") +
  labs(
    title = expression(atop("Histogram of Steps by Day")),
    x = "Sum of Steps by Day Grouping",
    y = "Count of Days"
  ) +
  theme(
    plot.title = element_text(size = 36, color = "black"),
    axis.title = element_text(size = 20, color = "black"),
    axis.text  = element_text(size = 10, color = "black")
  )
print(a)
# Mean of the per-day step totals.
mean(perDay[["sum_of_steps"]])
## [1] 10766.19
# Median of the per-day step totals.
median(perDay[["sum_of_steps"]])
## [1] 10765
What is the average daily activity pattern? 1. Make a time series plot (i.e. type = “l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis).
# Step statistics per 5-minute interval across all days (complete rows only),
# ordered so the interval with the highest average comes first.
perTimeInterval <- na.omit(activity) %>%
  group_by(interval) %>%
  summarise(
    sum_of_steps    = sum(steps),
    mean_of_steps   = mean(steps),
    median_of_steps = median(steps)
  ) %>%
  arrange(desc(mean_of_steps))
# Line plot of the average number of steps for each 5-minute interval.
b <- ggplot(perTimeInterval, aes(x = interval, y = mean_of_steps)) +
  geom_line() +
  labs(
    title = expression(atop("Average Steps by Interval")),
    x = "5 Minute Interval",
    y = "Average Steps"
  ) +
  theme(
    plot.title = element_text(size = 36, color = "black"),
    axis.title = element_text(size = 20, color = "black"),
    axis.text  = element_text(size = 10, color = "black")
  )
print(b)
# The table is sorted by descending mean, so its first row is the interval
# with the highest average number of steps.
head(perTimeInterval, 1)
## Source: local data frame [1 x 4]
##
## interval sum_of_steps mean_of_steps median_of_steps
## 1 835 10927 206.1698 19
Imputing missing values Note that there are a number of days/intervals where there are missing values (coded as NA). The presence of missing days may introduce bias into some calculations or summaries of the data.
# Count the missing step observations in the raw data.
sum(is.na(activity$steps))
## [1] 2304
# Imputation strategy: replace every missing step count with the mean number
# of steps observed for the same 5-minute interval.
# Lookup table of interval -> mean steps; columns are picked by name so the
# code does not depend on their positions in perTimeInterval.
replace <- perTimeInterval[, c("interval", "mean_of_steps")]
# Attach the interval mean to each row with a missing step count. The join key
# is stated explicitly instead of being guessed from shared column names.
na.replace <- left_join(na.activity, replace, by = "interval")
# Drop the all-NA steps column (by name) and let the interval mean replace it.
na.replace <- na.replace[, setdiff(names(na.replace), "steps"), drop = FALSE]
na.replace <- rename(na.replace, steps = mean_of_steps)
# Recombine imputed and observed rows; rbind() matches data frame columns by
# name, so the differing column order of the two pieces is harmless.
activity.complete <- rbind(na.replace, activity.drop)
row.names(activity.complete) <- NULL
4. Make a histogram of the total number of steps taken each day, and calculate and report the mean and median total number of steps taken per day. Do these values differ from the estimates from the first part of the assignment? What is the impact of imputing missing data on the estimates of the total daily number of steps?
Answer: The mean has not changed because the missing values were replaced by the means of their intervals. The median, however, has changed for the same reason: for the days with missing values, the sum of steps is now the sum of the interval means for those days.
# Per-day step statistics recomputed on the data set with imputed values.
perDay.complete <- group_by(activity.complete, date) %>%
  arrange(date) %>%
  summarise(
    sum_of_steps    = sum(steps),
    mean_of_steps   = mean(steps),
    median_of_steps = median(steps)
  )
# Histogram of the daily step totals after imputation (bins of 1000 steps).
c <- ggplot(perDay.complete, aes(x = sum_of_steps)) +
  geom_histogram(binwidth = 1000, fill = "darkblue", color = "black") +
  labs(
    title = expression(atop("Histogram of Steps by Day ~ Complete")),
    x = "Sum of Steps by Day Grouping",
    y = "Count of Days"
  ) +
  theme(
    plot.title = element_text(size = 24, color = "black"),
    axis.title = element_text(size = 20, color = "black"),
    axis.text  = element_text(size = 10, color = "black")
  )
print(c)
# Mean of the per-day step totals after imputation.
mean(perDay.complete[["sum_of_steps"]])
## [1] 10766.19
# Median of the per-day step totals after imputation.
median(perDay.complete[["sum_of_steps"]])
## [1] 10766.19
Are there differences in activity patterns between weekdays and weekends?
For this part, the weekdays() function may be of some help. Use the dataset with the filled-in missing values.
# Label every observation as falling on a "weekday" or a "weekend".
# The day of week is taken from POSIXlt's numeric `wday` field
# (0 = Sunday, ..., 6 = Saturday) rather than from weekdays(), whose day names
# depend on the session locale and would not match English labels elsewhere.
# This also removes the need to attach plyr (which masks dplyr verbs) and to
# detach it again afterwards.
activity.complete$DOW <- local({
  wday <- as.POSIXlt(activity.complete$date)$wday
  # `==` (not %in%) keeps a missing date as NA instead of calling it a weekday.
  factor(
    ifelse(wday == 0L | wday == 6L, "weekend", "weekday"),
    levels = c("weekday", "weekend")
  )
})
# Average steps per 5-minute interval, computed separately for weekdays and
# weekends, ordered by descending average.
perTimeInterval.complete <- group_by(activity.complete, interval, DOW) %>%
  summarise(mean_of_steps = mean(steps)) %>%
  arrange(desc(mean_of_steps))
library(ggplot2)
# Average steps per interval, one panel per day type (weekday / weekend),
# stacked vertically.
g <- ggplot(perTimeInterval.complete, aes(x = interval, y = mean_of_steps)) +
  geom_line() +
  facet_grid(DOW ~ .) +
  labs(
    title = expression(atop("Weekday vs Weekend")),
    x = "Interval",
    y = "Average Steps"
  ) +
  theme(
    plot.title = element_text(size = 36, color = "black"),
    axis.title = element_text(size = 20, color = "black"),
    axis.text  = element_text(size = 10, color = "black")
  )
# Print explicitly, like the other plots in this file: a bare `g` is only drawn
# through top-level auto-printing, which does not happen under source().
print(g)