library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages --------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.8
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## Warning: package 'ggplot2' was built under R version 3.5.2
## Warning: package 'dplyr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.2
## -- Conflicts ------------------------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
Show any code that is needed to: 1. Load the data (i.e. read.csv()) 2. Process/transform the data (if necessary) into a format suitable for your analysis
# Read the activity CSV into a data frame, then convert it to a tibble.
fname <- 'activity.csv'
df <- read.csv(file = fname)
tbl <- df %>% as_tibble()
For this part of the assignment, you can ignore the missing values in the dataset. 1. Calculate the total number of steps taken per day
# Sum the steps recorded on each date (NA readings are dropped from the sum),
# then plot the distribution of those daily totals.
steps_per_day <- summarize(
  group_by(tbl, date),
  steps_per_day = sum(steps, na.rm = TRUE)
)
hist(
  steps_per_day[["steps_per_day"]],
  xlab = 'Steps per day',
  ylab = 'Frequency',
  main = 'Number of steps per day'
)
The mean number of steps per day is:
# Mean of the daily step totals.
mean(steps_per_day[["steps_per_day"]])
## [1] 9354.23
The median number of steps per day is:
# Median of the daily step totals.
median(steps_per_day[["steps_per_day"]])
## [1] 10395
Compute the average number of steps taken in each time interval, averaged across all days
# Average the step counts for each interval across all days, ignoring NAs.
steps_per_interval <- tbl %>%
  group_by(interval) %>%
  summarize(steps = mean(steps, na.rm = TRUE))

# Line plot of the average number of steps per interval.
# aes() is passed as a plain named argument: wrapping `mapping = aes(...)` in
# an extra pair of parentheses would be evaluated as an assignment and leak a
# `mapping` variable into the calling environment.
ggplot(data = steps_per_interval) +
  geom_line(mapping = aes(x = interval, y = steps)) +
  xlab('interval') +
  ylab('Average number of steps')

# Row index of the interval with the highest average. which.max() avoids
# testing doubles for equality with `==` and always yields a single index.
index_interval_max <- which.max(steps_per_interval$steps)
steps_per_interval$interval[index_interval_max]
## [1] 835
Find the number of missing values (NA) in the steps column
# Count the missing values in every column and report the entry for `steps`.
# vapply() pins the result type to one integer per column (sapply() does not),
# and the column is selected by name instead of by position.
vapply(tbl, function(col) sum(is.na(col)), integer(1))["steps"]
## steps
## 2304
Note that there are a number of days/intervals where there are missing values (coded as NAs). The presence of missing days may introduce bias into some calculations or summaries of the data.
# Total number of rows whose step count is missing.
sum(is.na(tbl[["steps"]]))
## [1] 2304
To fill in the missing values, a helper function looks up the average number of steps for a given interval; each missing value is then replaced by that interval's average (truncated to an integer).
#' Look up the average number of steps for one or more intervals.
#'
#' @param interval Vector of interval identifiers to look up.
#' @param lookup Table with columns `interval` and `steps`. Defaults to the
#'   `steps_per_interval` summary, so existing one-argument calls behave as
#'   before; pass another table to reuse the lookup on different data.
#' @return Vector of `steps` values taken from `lookup`, one per element of
#'   `interval`, in the same order (NA where an interval has no match).
avg_steps_per_interval <- function(interval, lookup = steps_per_interval) {
  # match() gives the row position of each requested interval; indexing the
  # column directly avoids building an intermediate row subset.
  lookup$steps[match(interval, lookup$interval)]
}
# Impute: keep observed step counts; where steps is NA, substitute that
# interval's average coerced to integer (presumably so both if_else() branches
# have the same type as the steps column).
# NOTE(review): as.integer() truncates the averages rather than rounding them.
tbl_no_missing_data <- tbl %>%
  mutate(
    steps = if_else(
      !is.na(steps),
      steps,
      as.integer(avg_steps_per_interval(interval))
    )
  )

# Daily totals on the imputed data, followed by their mean.
steps_per_day_no_missing_data <- summarize(
  group_by(tbl_no_missing_data, date),
  steps_per_day = sum(steps)
)
mean(steps_per_day_no_missing_data[["steps_per_day"]])
## [1] 10749.77
# Median of the daily step totals after imputation.
median(steps_per_day_no_missing_data[["steps_per_day"]])
## [1] 10641
Indeed the estimates differ from the first part of the assignment. Both the mean and median increased.
Check to see if the number of missing values is indeed 0
# Number of step counts still missing after imputation.
sum(is.na(tbl_no_missing_data[["steps"]]))
## [1] 0
# Distribution of the daily step totals after imputation.
hist(
  steps_per_day_no_missing_data[["steps_per_day"]],
  xlab = 'Steps per day',
  ylab = 'Frequency',
  main = 'Number of steps per day'
)
# Are there differences in activity patterns between weekdays and weekends? The weekdays() function may be of some help for this part. Use the dataset with the filled-in missing values.
# Flag each row as weekend (0) or weekday (1). POSIXlt's `wday` field runs
# 0-6 starting on Sunday, so 0 and 6 are the weekend days. Done in a single
# mutate() that refers to the `date` column directly.
tbl_no_missing_data <- mutate(
  tbl_no_missing_data,
  weekday = if_else(as.POSIXlt(date)$wday %in% c(0, 6), 0, 1)
)

# Per-interval averages over the imputed data, all days pooled.
tbl_avg_steps_per_interval_no_missing_data <- tbl_no_missing_data %>%
  group_by(interval) %>%
  summarize(steps = mean(steps, na.rm = TRUE))

# Average number of steps per interval, one panel per day type.
# stat_summary() averages the steps at each interval within each panel; the
# labeller turns the 0/1 codes into readable panel titles.
# NOTE(review): `fun.y` matches the ggplot2 3.1.0 this report was built with;
# ggplot2 >= 3.3.0 names this argument `fun`.
ggplot(data = tbl_no_missing_data, aes(x = interval, y = steps)) +
  stat_summary(fun.y = "mean", geom = "line") +
  xlab('interval') +
  ylab('Average number of steps') +
  facet_wrap(
    ~weekday,
    labeller = as_labeller(c(`0` = 'weekend', `1` = 'weekday'))
  )