This assignment makes use of data from a personal activity monitoring device. This device collects data at 5-minute intervals throughout the day. The data consist of two months of observations from an anonymous individual, collected during October and November 2012, and include the number of steps taken in 5-minute intervals each day.
This document presents the results from Project Assignment 1 in the Coursera course Reproducible Research, written in a single R markdown document that can be processed by knitr and transformed into an HTML file.
Show any code that is needed to
# Load the data (i.e. read.csv())
#
# Attach the packages used throughout the analysis. library() stops with an
# error when a package is missing, whereas require() only returns FALSE and
# lets the script fail later with a less helpful message.
library(lubridate)
library(dplyr)
library(ggplot2)

# NOTE: the workspace is no longer wiped with rm(list = ls()), the console is
# no longer cleared, and the working directory is no longer forced to a
# machine-specific OneDrive folder via setwd(). Those calls made the script
# destructive for the caller's session and unrunnable on any other machine.
# All paths below are relative to the current working directory.
url_data <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
zip_file <- "./activity.zip"
csv_file <- "activity.csv"

# Download the archive only when it is not already present; mode = "wb" writes
# the zip file in binary mode.
if (!file.exists(zip_file)) {
  download.file(url_data, destfile = zip_file, method = "auto", mode = "wb")
}

# Extract the CSV only when it has not been unzipped yet.
if (!file.exists(csv_file)) {
  unzip(zipfile = zip_file)
}

# Columns used later in the script: steps, date, interval.
dat <- read.csv(file = csv_file)
# Convert the date column with lubridate's year-month-day parser.
dat[["date"]] <- ymd(dat[["date"]])
# For this part of the assignment, you can ignore the missing values in the
# dataset.
#
# Per-day summary: total steps, mean steps (NAs dropped within each day) and
# the sum of the interval column.
dat.temp <- dat %>% group_by(date)
dat.summarize <- dat.temp %>%
  summarize(
    sumSteps = sum(steps, na.rm = TRUE),
    meanSteps = mean(steps, na.rm = TRUE),
    sumInterval = sum(interval)
  )
# Drop the days whose mean came out as NaN.
dat.summarize_good <- dat.summarize %>% filter(!is.na(meanSteps))
# Mean and median of the total number of steps per day.
step_mean <- mean(dat.summarize_good$sumSteps)
step_median <- median(dat.summarize_good$sumSteps)
# Histogram of the daily totals, 1000 steps per bin, bars shaded by count.
ggplot(data = dat.summarize_good, mapping = aes(x = sumSteps)) +
  geom_histogram(
    mapping = aes(fill = ..count..),
    binwidth = 1000,
    colour = "grey4"
  ) +
  labs(
    x = "Steps per day",
    y = "Frequency",
    title = "Histogram of Steps per day"
  )
# We have a mean value of 1.0766189 x 10^4 (about 10766.19)
and a median value of 10765
# Time series plot (i.e. type = "l") of the 5-minute interval (x-axis) and the
# average number of steps taken, averaged across all days (y-axis)
dat.temp2 <- dat %>% group_by(interval)
dat.summarize2 <- dat.temp2 %>%
  summarize(meanSteps = mean(steps, na.rm = TRUE))
# Discard any interval whose mean is NaN.
dat.summarize2_good <- dat.summarize2 %>% filter(!is.na(meanSteps))
ggplot(
  data = dat.summarize2_good,
  mapping = aes(x = interval, y = meanSteps)
) +
  geom_line(colour = "firebrick")
# Show the row of the interval with the highest average step count.
dat.summarize2_good %>% slice(which.max(meanSteps))
#> # A tibble: 1 x 2
#> interval meanSteps
#> <int> <dbl>
#> 1 835 206.1698
Note that there are a number of days/intervals where there are missing values (coded as NA). The presence of missing days may introduce bias into some calculations or summaries of the data.
# Number of rows with a missing step count.
sum(is.na(dat$steps))
#> [1] 2304
# Imputation, this time with tapply: every missing step count is replaced by
# the mean of its 5-minute interval, computed from the non-missing values.
ndx <- is.na(dat$steps)
int_avg <- tapply(
  dat$steps,
  dat$interval,
  mean,
  na.rm = TRUE,
  simplify = TRUE
)
dat.filled <- dat
# int_avg is named by interval, so match the interval of each missing row
# against those names to pick the corresponding mean.
dat.filled$steps[ndx] <-
  int_avg[match(dat.filled$interval[ndx], names(int_avg))]
# Count the missing values that remain after filling.
sum(is.na(dat.filled$steps))
#> [1] 0
# Per-day total and mean of steps, computed on the filled-in data set.
dat.temp3 <- dat.filled %>% group_by(date)
dat.summarize3 <- dat.temp3 %>%
  summarize(
    sumSteps = sum(steps, na.rm = TRUE),
    meanSteps = mean(steps, na.rm = TRUE)
  )
# Preview the first rows of the summary.
dat.summarize3 %>% head()
#> # A tibble: 6 x 3
#> date sumSteps meanSteps
#> <date> <dbl> <dbl>
#> 1 2012-10-01 10766.19 37.38260
#> 2 2012-10-02 126.00 0.43750
#> 3 2012-10-03 11352.00 39.41667
#> 4 2012-10-04 12116.00 42.06944
#> 5 2012-10-05 13294.00 46.15972
#> 6 2012-10-06 15420.00 53.54167
# Mean and median of the daily totals after filling in the missing values.
step_mean_filled <- mean(dat.summarize3[["sumSteps"]])
step_median_filled <- median(dat.summarize3[["sumSteps"]])
# Histogram of the daily totals, 1000 steps per bin, bars shaded by count.
ggplot(data = dat.summarize3, mapping = aes(x = sumSteps)) +
  geom_histogram(
    mapping = aes(fill = ..count..),
    binwidth = 1000,
    colour = "grey4"
  ) +
  labs(
    x = "Steps per day",
    y = "Frequency",
    title = "Histogram of Steps per day"
  )
# For this part the **weekdays()** function may be of some help here. Use the
# dataset with the filled-in missing values for this part.
Make a panel plot containing a time series plot (i.e. type = "l") of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis). See the README file in the GitHub repository to see an example of what this plot should look like using simulated data.
# Label every observation as falling on a weekday or on a weekend day.
#
# The day of the week is read from as.POSIXlt()$wday (0 = Sunday, ...,
# 6 = Saturday), which does not depend on the session locale. The previous
# comparison of weekdays() output with the German names "Samstag"/"Sonntag"
# only worked under a German locale; under any other locale every row was
# labelled "weekday".
dat.filled <- mutate(
  dat.filled,
  weektype = factor(
    ifelse(as.POSIXlt(date)$wday %in% c(0L, 6L), "weekend", "weekday"),
    levels = c("weekday", "weekend")
  )
)
#> Warning: package 'bindrcpp' was built under R version 3.3.3
head(dat.filled)
#> steps date interval weektype
#> 1 1.7169811 2012-10-01 0 weekday
#> 2 0.3396226 2012-10-01 5 weekday
#> 3 0.1320755 2012-10-01 10 weekday
#> 4 0.1509434 2012-10-01 15 weekday
#> 5 0.0754717 2012-10-01 20 weekday
#> 6 2.0943396 2012-10-01 25 weekday
# Total and mean steps for every combination of 5-minute interval and
# weekday/weekend label.
dat.temp4 <- dat.filled %>% group_by(interval, weektype)
dat.summarize4 <- dat.temp4 %>%
  summarize(
    sumSteps = sum(steps, na.rm = TRUE),
    meanSteps = mean(steps, na.rm = TRUE)
  )
# Preview the first rows of the summary.
dat.summarize4 %>% head()
#> # A tibble: 6 x 4
#> # Groups: interval [3]
#> interval weektype sumSteps meanSteps
#> <int> <fctr> <dbl> <dbl>
#> 1 0 weekday 101.3018868 2.25115304
#> 2 0 weekend 3.4339623 0.21462264
#> 3 5 weekday 20.0377358 0.44528302
#> 4 5 weekend 0.6792453 0.04245283
#> 5 10 weekday 7.7924528 0.17316562
#> 6 10 weekend 0.2641509 0.01650943
# Panel plot: mean steps per 5-minute interval, one line per weektype, with
# one facet column for each weektype level.
s <- ggplot(
  data = dat.summarize4,
  mapping = aes(x = interval, y = meanSteps, colour = weektype)
)
s <- s + geom_line()
s <- s + facet_grid(. ~ weektype)
print(s)