This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button, a document will be generated that includes both the content and the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
#Reproducible Research: Peer
#Assessment 1
#========================================================
# 1. Code for reading in the dataset and/or processing the data
library(knitr);
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggthemes)
# Load the raw activity data. The file is resolved against the current working
# directory instead of a hard-coded, machine-specific path, so the analysis can
# be run from any checkout that contains activity.csv.
if (!file.exists("activity.csv")) {
  stop("activity.csv not found in ", getwd(), call. = FALSE)
}
activity <- read.csv("activity.csv")
# Explore the basic structure of the data: dimensions, column names, first
# rows and column types. The lines starting with "##" appear to be the console
# output recorded when the document was rendered.
# Number of rows and columns
dim(activity)
## [1] 17568 3
# Column names
names(activity)
## [1] "steps" "date" "interval"
# First six observations
head(activity)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
# Column types; note that date is still a character column at this point
str(activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : chr "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
# Proportion of observations whose step count is missing (NA rows / all rows)
sum(is.na(activity$steps)) / nrow(activity)
## [1] 0.1311475
# Convert the date column from character to Date
activity$date <- as.Date(activity$date)
You can also embed plots, for example:
# 2. Histogram of the total number of steps taken each day
# Total number of steps per day. The formula interface of aggregate() drops
# rows with missing steps before summing, so days without any recorded steps
# are omitted rather than counted as zero-step days.
StepsTotal <- aggregate(steps ~ date, data = activity, sum, na.rm = TRUE)
summary(StepsTotal$steps)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 41 8841 10765 10766 13294 21194
# The bin breaks run up to 22000, the same upper bound as the x axis, so the
# busiest day (21194 steps, see the summary above) falls inside a bin. With
# breaks that stopped at 20000 that day was left out of the histogram.
ggplot(data = StepsTotal, aes(steps)) +
  geom_histogram(
    breaks = seq(0, 22000, by = 500),
    col = "darkgreen",
    fill = "green",
    alpha = .2
  ) +
  labs(title = "Daily Number of Steps", x = "Steps Per Day", y = "Number of Days") +
  xlim(c(0, 22000)) +
  ylim(c(0, 8))
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.
# 3. Mean and median number of steps taken each day
# Mean of the daily step totals
with(StepsTotal, mean(steps))
## [1] 10766.19
# Median of the daily step totals
with(StepsTotal, median(steps))
## [1] 10765
# 4. Time series plot of the average number of steps taken
# Mean of the recorded step counts on each date
avg_steps_per_day <- aggregate(steps ~ date, data = activity, FUN = mean)
# Line chart of that per-date mean over time
ggplot(data = avg_steps_per_day, mapping = aes(x = date, y = steps)) +
  geom_line(color = "blue", size = 0.1) +
  theme_hc() +
  labs(title = "Average number of steps", y = "Steps Per Day", x = "Date") +
  scale_colour_manual("", values = "blue") +
  scale_fill_manual("", values = "grey12") +
  theme(
    axis.text.x = element_text(angle = 0, hjust = 1),
    panel.background = element_rect(fill = NA)
  )
# 5. The 5-minute interval that, on average, contains the maximum number of steps
# Mean number of steps for each 5-minute interval across all days
avg_steps_per_interval <- aggregate(steps ~ interval, activity, mean)
# Row index of the interval with the highest mean
interval_idx <- which.max(avg_steps_per_interval$steps)
# Report the interval itself together with its mean step count; the index
# alone does not answer the question posed by this section.
avg_steps_per_interval[interval_idx, ]
# 6. Code to describe and show a strategy for imputing missing data
# Strategy: every missing step count is replaced by the mean number of steps
# observed for the same 5-minute interval, rounded to a whole number.
# The per-interval means are recomputed here so this section does not depend
# on the previous one.
avg_steps_per_interval <- aggregate(steps ~ interval, activity, mean)
# Rows that need a value
missing_rows <- is.na(activity$steps)
# For each of those rows, the position of its interval in the table of means.
# An interval without any observed mean yields NA and is left missing.
mean_idx <- match(
  activity$interval[missing_rows],
  avg_steps_per_interval$interval
)
# Fill all missing values in one vectorised assignment
activity$steps[missing_rows] <- round(avg_steps_per_interval$steps[mean_idx])
# 7. Histogram of the total number of steps taken each day after missing values are imputed
# Daily step totals computed from the data with imputed values
steps_per_day_impute <- aggregate(steps ~ date, data = activity, FUN = sum)
# Base-graphics histogram of those daily totals
hist(
  steps_per_day_impute$steps,
  main = "Histogram of total number of steps per day (IMPUTED)",
  xlab = "Steps per day"
)
# 8. Panel plot comparing the average number of steps taken per 5-minute interval across weekdays and weekends
# Create a function to determine whether a date is a weekday or a weekend day
# Classify dates as 'Weekday' or 'Weekend'.
#
# date_val: a Date, or a character string in '%Y-%m-%d' format; vectors are
#           accepted and classified element by element.
# Returns a character vector the same length as date_val containing
# 'Weekday' or 'Weekend'; dates that cannot be parsed give NA.
#
# The day of the week is taken from the numeric wday field of POSIXlt
# (0 = Sunday, ..., 6 = Saturday) instead of the names returned by weekdays(),
# which depend on the session locale and only matched the Portuguese/Spanish
# names the code compared against.
week_day <- function(date_val) {
  wday <- as.POSIXlt(as.Date(date_val, format = '%Y-%m-%d'))$wday
  x <- rep('Weekday', length(wday))
  x[wday %in% c(0L, 6L)] <- 'Weekend'
  # Keep unknown dates unknown instead of silently labelling them
  x[is.na(wday)] <- NA_character_
  x
}
# Apply the week_day function to every date and add the result to the activity
# dataset as a factor column. vapply() is used instead of sapply() so the
# result is always a character vector with one label per row, whatever the input.
activity$day_type <- as.factor(vapply(activity$date, week_day, character(1)))
# Attach ggplot2 for the panel plot
library(ggplot2)
# Mean number of steps for every combination of interval and day_type
steps_per_day_impute <- aggregate(
  steps ~ interval + day_type,
  data = activity,
  FUN = mean
)
# Build the plot: one panel per day type, stacked vertically, with the line
# coloured by day type
plt <- ggplot(
  data = steps_per_day_impute,
  mapping = aes(x = interval, y = steps)
) +
  geom_line(mapping = aes(colour = day_type), stat = "identity") +
  theme_gray() +
  facet_grid(day_type ~ ., scales = "fixed", space = "fixed") +
  labs(x = "Interval", y = expression("No of Steps")) +
  ggtitle("No of steps Per Interval by day type")
# Render the plot
print(plt)