knitr::opts_chunk$set(echo = TRUE)
library(ggplot2)
library(knitr)
library(RColorBrewer) #This one is to pimp the plots a bit :)
# Fetch the raw data into the current working directory.
# The archive is downloaded only when it is not already present, and it is
# extracted only when activity.csv is missing, so re-running the script does
# not repeat network or disk work.
fileURL <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
zipfile <- "./repdata%2Fdata%2Factivity.zip"
if (!file.exists(zipfile)) {
  # mode = "wb": the archive is binary; a text-mode transfer corrupts it on Windows
  download.file(fileURL, destfile = zipfile, mode = "wb")
}
if (!file.exists("activity.csv")) {
  unzip(zipfile)
}
# Load the extracted CSV into a data frame
activity <- read.csv("activity.csv")
No further processing or modification is applied to the data at this point (see the later steps below).
# Total number of steps per day: sum the step counts within each date.
# sum() is called without na.rm, so a date containing missing values yields NA.
stepsday <- tapply(activity$steps, activity$date, sum)
# Histogram of the daily totals
hist(
  stepsday,
  col = brewer.pal(6, "Accent"),
  xlab = "Steps taken each day",
  main = "Number of steps per day"
)
# Per-day mean and median of the step counts (one value per date; useful for
# plotting, but these are not the overall figures reported below)
stepsmean <- tapply(activity$steps, activity$date, mean, na.rm = TRUE)
stepsmedian <- tapply(activity$steps, activity$date, median, na.rm = TRUE)
# Report the mean of the daily totals, ignoring NA days
mean(stepsday, na.rm = TRUE)
## [1] 10766.19
# Report the median of the daily totals, ignoring NA days
median(stepsday, na.rm = TRUE)
## [1] 10765
# Mean number of steps for each 5-minute interval, averaged over all days
stepsinterval <- aggregate(
  steps ~ interval,
  data = activity,
  FUN = mean,
  na.rm = TRUE
)
# Line plot of the average number of steps against the interval identifier
plot(
  stepsinterval$interval,
  stepsinterval$steps,
  type = "l",
  lwd = 2,
  xlab = "Interval",
  ylab = "number of steps",
  main = "Steps vs. Interval (daily average)"
)
# Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?
stepsinterval[which.max(stepsinterval[["steps"]]), ]
## interval steps
## 104 835 206.1698
So we clearly see that interval 835 is the one containing the maximum average number of steps (about 206).
But first, let's see how many NA values we can find in the steps variable.
# Total number of missing values in the dataset (counted over every column)
sum(is.na(activity))
## [1] 2304
# Fill the blanks: replace each missing step count with the mean number of
# steps recorded for the same 5-minute interval (taken from stepsinterval).
activityNAfilled <- activity # Work on a copy so the original data stay untouched
missing_steps <- is.na(activityNAfilled$steps)
# match() looks up, for every missing row at once, the row of stepsinterval
# holding that interval's mean. An interval with no entry in stepsinterval
# stays NA instead of aborting with a zero-length replacement error.
interval_rows <- match(
  activityNAfilled$interval[missing_steps],
  stepsinterval$interval
)
activityNAfilled$steps[missing_steps] <- stepsinterval$steps[interval_rows]
# Total number of steps per day, computed on the NA-imputed data
stepsdaynoNA <- tapply(activityNAfilled$steps, activityNAfilled$date, sum)
# Histogram of the daily totals after imputation
hist(
  stepsdaynoNA,
  col = brewer.pal(6, "Accent"),
  xlab = "Steps taken each day",
  main = "Number of steps per day (NA values imputed)"
)
# Mean of the daily totals
mean(stepsdaynoNA)
## [1] 10766.19
# Median of the daily totals
median(stepsdaynoNA)
## [1] 10766.19
# Create a new factor variable in the dataset with two levels - "weekday" and "weekend".
# The day of the week is taken from as.POSIXlt()$wday, a number from 0 (Sunday)
# to 6 (Saturday). Unlike weekdays(), which returns day names in the session
# locale, this classification gives the same result on every machine.
weekday_index <- as.POSIXlt(as.Date(activityNAfilled$date))$wday
# `==` (rather than %in%) keeps a missing date as NA instead of calling it a weekday
is_weekend <- weekday_index == 0L | weekday_index == 6L
# Stored as a factor so the two groups can be used directly as plot facets
activityNAfilled$day <- as.factor(ifelse(is_weekend, "weekend", "weekday"))
# Mean number of steps per 5-minute interval, separately for weekdays and weekends
activityfinal <- aggregate(
  steps ~ interval + day,
  data = activityNAfilled,
  FUN = mean
)
# Base plot object: average steps (y) against the 5-minute interval (x)
g <- ggplot(data = activityfinal, mapping = aes(x = interval, y = steps))
# One line panel per level of `day`, stacked in a single column, drawn on a
# white background with dark grey major grid lines
p <- g +
  geom_line() +
  facet_wrap(~day, ncol = 1) +
  theme(
    panel.grid.major = element_line(colour = "grey40"),
    panel.background = element_rect(fill = "white")
  ) +
  labs(y = "Number of steps")
print(p)