library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
Dataset for this project can be found in here “activity/activity.csv”
unzip("activity.zip")
activity_df <- read_csv("activity.csv")
## Parsed with column specification:
## cols(
## steps = col_double(),
## date = col_date(format = ""),
## interval = col_double()
## )
solution1a_df <- filter(activity_df, !is.na(steps)) %>%
group_by(date) %>%
summarise(total.steps = sum(steps))
hist(solution1a_df$total.steps,
main = "Histogram of total number of steps per day",
xlab = "Total steps per day")
rug(solution1a_df$total.steps)
#creating directory for saving figure file
dir.create(path = "figure")
## Copy my plot to a PNG file of width = 672px, height = 480px
dev.copy(png, file = "figure/plot1.png", width = 672, height = 480, units = "px" )
## png
## 3
## closing the PNG device!
dev.off()
## png
## 2
mean_day_steps <- mean(solution1a_df$total.steps)
median_day_steps <- median(solution1a_df$total.steps)
The mean number of total daily steps = 1.076618910^{4}
The median number of total daily steps = 1.076510^{4}
solution2a_df <- activity_df %>%
group_by(interval) %>%
summarise(avg.steps = mean(steps, na.rm = TRUE))
with(solution2a_df, plot(x = interval, y = avg.steps, type = "l",
main = "Time series plot of the interval and the average steps taken",
xlab = "Interval",
ylab = "Average steps"))
## Copy my plot to a PNG file of width = 672px, height = 480px
dev.copy(png, file = "figure/plot2.png", width = 672, height = 480, units = "px" )
## png
## 3
## closing the PNG device!
dev.off()
## png
## 2
filter(solution2a_df,
avg.steps == max(solution2a_df$avg.steps))
## # A tibble: 1 x 2
## interval avg.steps
## <dbl> <dbl>
## 1 835 206.
#This is the total number of rows with incomplete values
sum(!complete.cases(activity_df))
## [1] 2304
summary(activity_df)
## steps date interval
## Min. : 0.00 Min. :2012-10-01 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.:2012-10-16 1st Qu.: 588.8
## Median : 0.00 Median :2012-10-31 Median :1177.5
## Mean : 37.38 Mean :2012-10-31 Mean :1177.5
## 3rd Qu.: 12.00 3rd Qu.:2012-11-15 3rd Qu.:1766.2
## Max. :806.00 Max. :2012-11-30 Max. :2355.0
## NA's :2304
From the output above you see that missing values only exist for the the step variable (NA's :2304).
The idea is to use the mean for that 5-minute interval, because using the mean/median for the day date still produces NA/NaN and median for that 5-minute interval produces lots of zeros.
Stratagy for filling missing values using the mean for that interval:
1. Group the dataset by the interval variable and summarise the mean(steps, na.rm = TRUE) call it no_NA_df.
2. Inner-join no_NA_df and activity_df by their interval column call it no_NA_df.
3. Loop through the no_NA_df, assigning the value of no_NA_df$interval.mean.steps anywhere no_NA_df$steps is NA.
no_NA_df <- activity_df %>%
group_by(interval) %>%
summarise(interval.mean.steps = mean(steps, na.rm = TRUE))
no_NA_df <- inner_join(x = activity_df, y = no_NA_df, by = "interval")
for (i in 1:nrow(no_NA_df)) {
if(is.na(no_NA_df$steps[i])){
no_NA_df$steps[i] <- no_NA_df$interval.mean.steps[i]
}else
next
}
#using the `no_NA_df` dataset
day_total_steps_df <- no_NA_df %>%
group_by(date) %>%
summarise(total.steps = sum(steps))
hist(day_total_steps_df$total.steps,
main = "Histogram of total number of steps per day",
xlab = "Total steps per day")
rug(day_total_steps_df$total.steps)
## Copy my plot to a PNG file of width = 672px, height = 480px
dev.copy(png, file = "figure/plot3.png", width = 672, height = 480, units = "px" )
## png
## 3
## closing the PNG device!
dev.off()
## png
## 2
mean_day_steps_noNA <- mean(day_total_steps_df$total.steps)
median_day_steps_noNA <- median(day_total_steps_df$total.steps)
Former mean number of total daily steps = 1.076618910^{4}
Former median number of total daily steps = 1.076510^{4}
Current mean number of total daily steps = 1.076618910^{4}
Current median number of total daily steps = 1.076618910^{4}
Do these values differ from the estimates from the first part of the assignment? Yes.
Their mean are the same but median differ by 1.1886792 total steps.
#using the dataset with the filled-in missing values
solution4a_df <- no_NA_df %>%
mutate(day.type = if_else((weekdays(date) == "Sunday") | (weekdays(date) == "Saturday"),
true = "weekend", false = "weekday")) %>%
mutate(day.type = as.factor(day.type))
solution4b_df <- group_by(solution4a_df, day.type, interval) %>%
summarise(avg.steps = mean(steps))
#plotting
g <-ggplot(data = solution4b_df, aes(x = interval, y = avg.steps))
#modifing aesthetics
g <- g + facet_wrap(. ~ day.type, ncol = 1) + geom_line()
#modifying labels
g + labs(title = "Plot of 5-minute interval and average step") +
labs(subtitle = "Averaged across all weekday days or weekend days") +
labs(x = "Interval") +
labs(y = "Number of steps")
## Copy my plot to a PNG file of width = 672px, height = 480px
dev.copy(png, file = "figure/plot4.png", width = 672, height = 480, units = "px" )
## png
## 3
## closing the PNG device!
dev.off()
## png
## 2