Introduction

This file is the Course Project 1 report for the Reproducible Research course, which is part of the Johns Hopkins University Data Science program.

Code

Setup code.

# Global chunk options: show the code in the report, hide warnings and messages.
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)

First, we load the libraries needed to process and plot the data.

library(dplyr)
library(ggplot2)
library(tidyverse)
library(lubridate)
library(lattice)

1. Code for reading in the dataset and/or processing the data.

# Read the raw activity data. The file is expected to sit next to this report:
# knitr evaluates chunks with the working directory set to the folder of the
# source document, so no machine-specific setwd() call is needed (an absolute
# path would make the report fail on any other computer).
activity <- read.csv("activity.csv", header = TRUE, sep = ",")
head(activity)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25
str(activity)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : chr  "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...

2. Histogram of the total number of steps taken each day.

# Total number of steps per day. Rows with a missing step count are dropped
# before summing: sum(steps, na.rm = TRUE) on a day without any recorded data
# returns 0, which would add spurious zero-step days to the histogram.
activityPerDay <- activity %>%
  filter(!is.na(steps)) %>%
  group_by(date) %>%
  summarise(Steps = sum(steps))
ggplot(activityPerDay, aes(x = Steps)) +
  geom_histogram(fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) +
  ylab("Frequency") +
  xlab("Steps") +
  ggtitle("Histogram steps per day")

3. Mean and median number of steps taken each day.

# Mean step count per 5-minute interval, computed separately for every day
# and drawn as one bar per day (date labels are hidden to avoid clutter).
stepsPerDayMean <- activity %>%
  group_by(date) %>%
  summarise(Mean = mean(steps, na.rm = TRUE))
ggplot(stepsPerDayMean, aes(x = date, y = Mean)) +
  geom_col(fill = "#69b3a2") +
  labs(title = "Mean of steps per 5 minutes interval",
       y = "Mean per 5 minutes interval") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

# Median step count per 5-minute interval for every day, drawn as points.
stepsPerDayMedian <- activity %>%
  group_by(date) %>%
  summarise(Median = median(steps, na.rm = TRUE))
ggplot(stepsPerDayMedian, aes(x = date, y = Median)) +
  geom_point(fill = "#69b3a2") +
  labs(title = "Median of steps per 5 minutes interval",
       y = "Median per 5 minutes interval") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

# Mean and median of the total number of steps taken per day. Missing step
# counts are removed before grouping so that days without any recorded data
# are left out; with sum(steps, na.rm = TRUE) such days would be counted as
# days with 0 steps and pull both statistics downwards.
totalStepsDay <- activity %>%
  filter(!is.na(steps)) %>%
  group_by(date) %>%
  summarise(TotalSteps = sum(steps))
mymean <- mean(totalStepsDay$TotalSteps)
# NOTE: the variable is named `median` like the base function; calls such as
# median(x) still resolve to the function because R skips non-function
# objects when looking up a call.
median <- as.numeric(median(totalStepsDay$TotalSteps))

print(paste("The mean steps per day is: ", mymean))
## [1] "The mean steps per day is:  10766.1886792453"
print(paste("The median steps per day is: ", median))
## [1] "The median steps per day is:  10765"

4. Time series plot of the average number of steps taken.

# Average number of steps in each 5-minute interval, taken across all days
# (missing values ignored), shown as a line over the intervals.
stepsPerIntervalAverage <- activity %>%
  group_by(interval) %>%
  summarise(StepsAverage = mean(steps, na.rm = TRUE))
ggplot(stepsPerIntervalAverage, aes(x = interval, y = StepsAverage)) +
  geom_line(color = "#69b3a2", alpha = 0.9) +
  labs(title = "Time series of the average number of steps",
       x = "Interval")

5. The 5-minute interval that, on average, contains the maximum number of steps.

# Locate the 5-minute interval with the highest average step count.
# which.max() returns a single row index, which avoids an exact `==`
# comparison between doubles and guarantees exactly one result on ties.
peakRow <- which.max(stepsPerIntervalAverage$StepsAverage)
maxSteps <- stepsPerIntervalAverage$StepsAverage[peakRow]
interval <- stepsPerIntervalAverage[peakRow, 1]
# The value is an average per interval (not a per-day figure), so the message
# says so.
print(paste("The maximum average number of steps per 5-minute interval is: ", maxSteps, "and the interval is: ", as.numeric(interval)))
## [1] "The maximum average number of steps per 5-minute interval is:  206.169811320755 and the interval is:  835"

6. Code to describe and show a strategy for imputing missing data.

# Imputation strategy: every missing step count is replaced by the mean number
# of steps observed for the same 5-minute interval across all days.
activity.noNA <- activity
print(paste("The total number of rows with NA values is: ", sum(is.na(activity.noNA$steps))))
## [1] "The total number of rows with NA values is:  2304"
# Vectorised replacement: look up the interval average for all missing rows at
# once instead of looping over the rows one by one (same result, and it also
# behaves correctly when there are no rows at all).
missingSteps <- is.na(activity.noNA$steps)
averageRow <- match(activity.noNA$interval[missingSteps],
                    stepsPerIntervalAverage$interval)
activity.noNA$steps[missingSteps] <- stepsPerIntervalAverage$StepsAverage[averageRow]

7. Histogram of the total number of steps taken each day after missing values are imputed.

# Total steps per day on the imputed data, shown as a histogram.
activityPerDay.noNA <- activity.noNA %>%
  group_by(date) %>%
  summarise(Steps = sum(steps, na.rm = TRUE))
ggplot(activityPerDay.noNA, aes(x = Steps)) +
  geom_histogram(fill = "#69b3a2", color = "#e9ecef", alpha = 0.9) +
  labs(title = "Histogram steps per day",
       x = "Steps",
       y = "Frequency")

# Per-day mean of the step counts per 5-minute interval, on the imputed data.
stepsPerDayMean.noNA <- activity.noNA %>%
  group_by(date) %>%
  summarise(Mean = mean(steps, na.rm = TRUE))
ggplot(stepsPerDayMean.noNA, aes(x = date, y = Mean)) +
  geom_col(fill = "#69b3a2") +
  labs(title = "Mean of steps per 5 minutes interval",
       y = "Mean per 5 minutes interval") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

# Per-day median of the step counts per 5-minute interval, on the imputed data.
stepsPerDayMedian.noNA <- activity.noNA %>%
  group_by(date) %>%
  summarise(Median = median(steps, na.rm = TRUE))
ggplot(stepsPerDayMedian.noNA, aes(x = date, y = Median)) +
  geom_point(fill = "#69b3a2") +
  labs(title = "Median of steps per 5 minutes interval",
       y = "Median per 5 minutes interval") +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank())

# Mean and median of the daily step totals after imputation.
totalStepsDay.noNA <- activity.noNA %>%
  group_by(date) %>%
  summarise(TotalSteps = sum(steps, na.rm = TRUE))
dailyTotals.noNA <- totalStepsDay.noNA$TotalSteps
mean.noNA <- mean(dailyTotals.noNA)
median.noNA <- as.numeric(median(dailyTotals.noNA))

print(paste("The mean steps per day is: ", mean.noNA))
## [1] "The mean steps per day is:  10766.1886792453"
print(paste("The median steps per day is: ", median.noNA))
## [1] "The median steps per day is:  10766.1886792453"
# Per-day means before imputation ...
ggplot(stepsPerDayMean, aes(x = date, y = Mean)) +
  geom_col(fill = "#69b3a2") +
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_blank())

# ... and after imputation, for visual comparison.
ggplot(stepsPerDayMean.noNA, aes(x = date, y = Mean)) +
  geom_col(fill = "#69b3a2") +
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_blank())

8. Panel plot comparing the average number of steps taken per 5-minute interval across weekdays and weekends.

# Parse the dates and label every observation as weekday or weekend.
activity.noNA$date <- ymd(activity.noNA$date)
# Day name kept for reference; weekdays() returns names in the session locale.
activity.noNA$weekday <- weekdays(activity.noNA$date)
# Classify by numeric day of week instead of by English day names, so the
# result does not depend on the locale (in a non-English locale no name would
# match "Saturday"/"Sunday" and every day would become "Weekday").
# POSIXlt$wday counts 0-6 starting on Sunday, so 0 and 6 are the weekend.
dayNumber <- as.POSIXlt(activity.noNA$date)$wday
activity.noNA$daytype <- factor(ifelse(dayNumber %in% c(0, 6), "Weekend", "Weekday"))
# Average steps per 5-minute interval, restricted to weekdays.
stepsPerIntervalAverage.noNA.weekdays <- activity.noNA %>%
  filter(daytype == "Weekday") %>%
  group_by(interval) %>%
  summarise(StepsAverage = mean(steps, na.rm = TRUE))

# Same summary, restricted to weekend days.
stepsPerIntervalAverage.noNA.weekends <- activity.noNA %>%
  filter(daytype == "Weekend") %>%
  group_by(interval) %>%
  summarise(StepsAverage = mean(steps, na.rm = TRUE))
# Weekday profile.
ggplot(stepsPerIntervalAverage.noNA.weekdays, aes(x = interval, y = StepsAverage)) +
  geom_line(color = "#69b3a2", alpha = 0.9) +
  labs(title = "Time series of the average number of steps",
       x = "Interval")

# Weekend profile.
ggplot(stepsPerIntervalAverage.noNA.weekends, aes(x = interval, y = StepsAverage)) +
  geom_line(color = "#69b3a2", alpha = 0.9) +
  labs(title = "Time series of the average number of steps",
       x = "Interval")

# Mean steps per interval within each day type, drawn with lattice as one
# line panel per day type.
plotdata <- aggregate(
  steps ~ interval + daytype,
  data = activity.noNA,
  FUN = mean
)
xyplot(
  steps ~ interval | factor(daytype),
  data = plotdata,
  type = "l",
  aspect = 1 / 3
)