This report is Course Project 1 of the Reproducible Research course, which is part of the Data Science Program of Johns Hopkins University.
Setup code.
# Chunk defaults for the whole report: echo the code, hide warnings and messages.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
First, we load the libraries needed to process the data.
library(dplyr)
library(ggplot2)
library(tidyverse)
library(lubridate)
library(lattice)
# Load the raw activity data (columns: steps, date, interval).
# The file is read through a relative path instead of a setwd() call pointing
# at one user's home directory, which would fail on every other machine.
# Run or knit the report from the directory that contains activity.csv.
activity <- read.csv("activity.csv", header = TRUE, sep = ",")

# Quick look at the first rows and the column types.
head(activity)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
str(activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : chr "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
# Total number of steps per day. Because NAs are dropped before summing,
# a day whose readings are all NA gets a total of 0.
activityPerDay <- activity %>%
  group_by(date) %>%
  summarise(Steps = sum(steps, na.rm = TRUE))

# Distribution of the daily totals.
ggplot(activityPerDay, aes(x = Steps)) +
  geom_histogram(fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) +
  labs(x = "Steps", y = "Frequency", title = "Histogram steps per day")
# Mean of the 5-minute step counts within each day (NAs ignored; a day with
# no recorded values yields NaN).
stepsPerDayMean <- activity %>%
  group_by(date) %>%
  summarise(Mean = mean(steps, na.rm = TRUE))

# One bar per day; the date labels are hidden because there are too many to read.
ggplot(stepsPerDayMean, aes(x = date, y = Mean)) +
  geom_col(fill = "#69b3a2") +
  labs(
    y = "Mean per 5 minutes interval",
    title = "Mean of steps per 5 minutes interval"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
# Median of the 5-minute step counts within each day (NAs ignored).
stepsPerDayMedian <- activity %>%
  group_by(date) %>%
  summarise(Median = median(steps, na.rm = TRUE))

# One point per day; the date labels are hidden because there are too many to read.
ggplot(stepsPerDayMedian, aes(x = date, y = Median)) +
  geom_point(fill = "#69b3a2") +
  labs(
    y = "Median per 5 minutes interval",
    title = "Median of steps per 5 minutes interval"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
# Mean and median of the total number of steps taken per day.
# NAs are dropped before summing, so a day with only NA readings counts as 0.
totalStepsDay <- activity %>%
  group_by(date) %>%
  summarise(TotalSteps = sum(steps, na.rm = TRUE))

mymean <- mean(totalStepsDay$TotalSteps)
# Stored as `mymedian` (parallel to `mymean`) so that the result does not
# shadow the base function median(), which is called again further down.
mymedian <- as.numeric(median(totalStepsDay$TotalSteps))

print(paste("The mean steps per day is: ", mymean))
## [1] "The mean steps per day is: 9354.22950819672"
print(paste("The median steps per day is: ", mymedian))
## [1] "The median steps per day is: 10395"
# Average number of steps for each 5-minute interval, taken across all days
# (NAs ignored).
stepsPerIntervalAverage <- activity %>%
  group_by(interval) %>%
  summarise(StepsAverage = mean(steps, na.rm = TRUE))

# Daily activity pattern as a line plot over the intervals.
ggplot(stepsPerIntervalAverage, aes(x = interval, y = StepsAverage)) +
  geom_line(color = "#69b3a2", alpha = 0.9) +
  labs(x = "Interval", title = "Time series of the average number of steps")
# Find the 5-minute interval with the highest average number of steps.
# which.max() gives the row of the maximum directly, which avoids comparing
# floating-point averages with `==`.
busiestRow <- which.max(stepsPerIntervalAverage$StepsAverage)
maxSteps <- stepsPerIntervalAverage$StepsAverage[busiestRow]
interval <- stepsPerIntervalAverage$interval[busiestRow]

# The value is an average per interval (not a per-day count), so say so.
print(paste("The maximum average number of steps in a 5-minute interval is: ", maxSteps, "and it occurs at interval: ", as.numeric(interval)))
## [1] "The maximum average number of steps in a 5-minute interval is: 206.169811320755 and it occurs at interval: 835"
# Impute missing step counts: work on a copy and replace every NA in `steps`
# with the across-day average for the same 5-minute interval.
activity.noNA <- activity

# Logical mask of the rows whose step count is missing.
missingSteps <- is.na(activity.noNA$steps)
print(paste("The total number of rows with NA values is: ", sum(missingSteps)))
## [1] "The total number of rows with NA values is: 2304"

# Vectorised lookup by column name: match() finds, for each missing row, the
# row of stepsPerIntervalAverage holding the same interval. This replaces a
# row-by-row loop that indexed columns by position.
activity.noNA$steps[missingSteps] <- stepsPerIntervalAverage$StepsAverage[
  match(activity.noNA$interval[missingSteps], stepsPerIntervalAverage$interval)
]
# Total number of steps per day, computed on the imputed data.
activityPerDay.noNA <- activity.noNA %>%
  group_by(date) %>%
  summarise(Steps = sum(steps, na.rm = TRUE))

# Distribution of the daily totals after imputation.
ggplot(activityPerDay.noNA, aes(x = Steps)) +
  geom_histogram(fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) +
  labs(x = "Steps", y = "Frequency", title = "Histogram steps per day")
# Mean of the 5-minute step counts within each day, on the imputed data.
stepsPerDayMean.noNA <- activity.noNA %>%
  group_by(date) %>%
  summarise(Mean = mean(steps, na.rm = TRUE))

# One bar per day; the date labels are hidden because there are too many to read.
ggplot(stepsPerDayMean.noNA, aes(x = date, y = Mean)) +
  geom_col(fill = "#69b3a2") +
  labs(
    y = "Mean per 5 minutes interval",
    title = "Mean of steps per 5 minutes interval"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
# Median of the 5-minute step counts within each day, on the imputed data.
stepsPerDayMedian.noNA <- activity.noNA %>%
  group_by(date) %>%
  summarise(Median = median(steps, na.rm = TRUE))

# One point per day; the date labels are hidden because there are too many to read.
ggplot(stepsPerDayMedian.noNA, aes(x = date, y = Median)) +
  geom_point(fill = "#69b3a2") +
  labs(
    y = "Median per 5 minutes interval",
    title = "Median of steps per 5 minutes interval"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
# Mean and median of the daily step totals after imputation.
totalStepsDay.noNA <- activity.noNA %>%
  group_by(date) %>%
  summarise(TotalSteps = sum(steps, na.rm = TRUE))

dailyTotals.noNA <- totalStepsDay.noNA$TotalSteps
mean.noNA <- mean(dailyTotals.noNA)
median.noNA <- as.numeric(median(dailyTotals.noNA))

print(paste("The mean steps per day is: ", mean.noNA))
## [1] "The mean steps per day is: 10766.1886792453"
print(paste("The median steps per day is: ", median.noNA))
## [1] "The median steps per day is: 10766.1886792453"
# Side-by-side comparison of the per-day means: first the raw data, then the
# imputed data. Date labels are hidden on both plots.
ggplot(stepsPerDayMean, aes(x = date, y = Mean)) +
  geom_col(fill = "#69b3a2") +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )

ggplot(stepsPerDayMean.noNA, aes(x = date, y = Mean)) +
  geom_col(fill = "#69b3a2") +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
# Tag every observation as falling on a weekday or on a weekend.
activity.noNA$date <- ymd(activity.noNA$date)

# Day name kept for reference only; weekdays() returns names in the current
# locale, so it must not be used to decide what counts as a weekend.
activity.noNA$weekday <- weekdays(activity.noNA$date)

# Locale-independent test: with week_start = 1 Monday is 1, so Saturday and
# Sunday are 6 and 7.
isWeekend <- wday(activity.noNA$date, week_start = 1) >= 6
activity.noNA$daytype <- factor(
  ifelse(isWeekend, "Weekend", "Weekday"),
  levels = c("Weekday", "Weekend")
)
# Average steps per 5-minute interval, computed separately for weekdays and
# weekends on the imputed data.
stepsPerIntervalAverage.noNA.weekdays <- activity.noNA %>%
  filter(daytype == "Weekday") %>%
  group_by(interval) %>%
  summarise(StepsAverage = mean(steps, na.rm = TRUE))

stepsPerIntervalAverage.noNA.weekends <- activity.noNA %>%
  filter(daytype == "Weekend") %>%
  group_by(interval) %>%
  summarise(StepsAverage = mean(steps, na.rm = TRUE))

# Weekday activity pattern.
ggplot(stepsPerIntervalAverage.noNA.weekdays, aes(x = interval, y = StepsAverage)) +
  geom_line(color = "#69b3a2", alpha = 0.9) +
  labs(x = "Interval", title = "Time series of the average number of steps")

# Weekend activity pattern.
ggplot(stepsPerIntervalAverage.noNA.weekends, aes(x = interval, y = StepsAverage)) +
  geom_line(color = "#69b3a2", alpha = 0.9) +
  labs(x = "Interval", title = "Time series of the average number of steps")
# Mean steps for each interval / day-type combination, drawn as a lattice
# line plot with one panel per day type.
plotdata <- aggregate(
  steps ~ interval + daytype,
  data = activity.noNA,
  FUN = mean
)
xyplot(
  steps ~ interval | factor(daytype),
  data = plotdata,
  type = "l",
  aspect = 1 / 3
)