In this section, I will use a dataset to perform a basic Exploratory Data Analysis (EDA). The dataset originates from a personal activity monitoring device that records the number of steps taken in 5-minute intervals throughout the day. The data were collected over a two-month period—from October to November 2012—from an anonymous individual.
You can find the dataset at the following link:
https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip
# Load the raw activity data.
# The archive is downloaded and extracted only when the files are not already
# on disk, so the script also runs from a clean working directory instead of
# failing in unzip() when the manual download step was skipped.
archive_path <- "repdata_data_activity.zip"
archive_url <-
  "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
if (!file.exists("activity.csv")) {
  if (!file.exists(archive_path)) {
    # mode = "wb" keeps the zip intact on platforms that translate line endings
    download.file(archive_url, destfile = archive_path, mode = "wb")
  }
  unzip(archive_path)
}
activity_data <- read.csv("activity.csv")
str(activity_data)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : chr "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
# Total number of steps taken on each day.
# sum(x, na.rm = TRUE) over a vector that is entirely NA returns 0, so a day
# with no recorded data would be reported as a real zero-step day. Such days
# are kept as NA instead, so they do not distort the histogram or pull the
# mean and median down.
total_steps_per_day <- tapply(
  activity_data$steps,
  activity_data$date,
  function(x) if (all(is.na(x))) NA_integer_ else sum(x, na.rm = TRUE)
)
total_steps_per_day
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06 2012-10-07
## NA 126 11352 12116 13294 15420 11015
## 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12 2012-10-13 2012-10-14
## NA 12811 9900 10304 17382 12426 15098
## 2012-10-15 2012-10-16 2012-10-17 2012-10-18 2012-10-19 2012-10-20 2012-10-21
## 10139 15084 13452 10056 11829 10395 8821
## 2012-10-22 2012-10-23 2012-10-24 2012-10-25 2012-10-26 2012-10-27 2012-10-28
## 13460 8918 8355 2492 6778 10119 11458
## 2012-10-29 2012-10-30 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04
## 5018 9819 15414 NA 10600 10571 NA
## 2012-11-05 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11
## 10439 8334 12883 3219 NA NA 12608
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17 2012-11-18
## 10765 7336 NA 41 5441 14339 15110
## 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23 2012-11-24 2012-11-25
## 8841 4472 12787 20427 21194 14478 11834
## 2012-11-26 2012-11-27 2012-11-28 2012-11-29 2012-11-30
## 11162 13646 10183 7047 NA
# hist() discards non-finite values, so the NA days are simply left out.
hist(total_steps_per_day, main = "Total steps per day", xlab = "Total Steps", col = "blue", breaks = 20)
# na.rm = TRUE drops the days without data from both summaries.
mean_steps <- mean(total_steps_per_day, na.rm = TRUE)
median_steps <- median(total_steps_per_day, na.rm = TRUE)
mean_steps
## [1] 10766.19
median_steps
## [1] 10765
# Mean number of steps for each 5-minute interval, averaged over all days
# (missing readings are ignored). The result is named by interval identifier.
average_steps_per_interval <- with(
  activity_data,
  tapply(steps, interval, mean, na.rm = TRUE)
)
# The interval identifiers are stored as character names; convert them to
# numbers explicitly for the x axis of the time-series plot.
plot(
  x = as.numeric(names(average_steps_per_interval)),
  y = average_steps_per_interval,
  type = "l",
  main = "Average Daily Activity Pattern",
  xlab = "5-minute interval",
  ylab = "Average number of steps"
)
# Position of the busiest interval, then its identifier.
max_interval <- which.max(average_steps_per_interval)
max_interval_value <- names(average_steps_per_interval)[max_interval]
max_interval_value
## [1] "835"
# Number of missing step readings.
total_na <- sum(is.na(activity_data$steps))
total_na
## [1] 2304
# Impute every missing reading with the mean of its 5-minute interval.
# The lookup is vectorised: the interval identifiers of the missing rows index
# average_steps_per_interval by name in a single step. This replaces a
# row-by-row loop over 1:nrow(), which misbehaves on an empty data frame.
activity_data_imputed <- activity_data
missing_rows <- is.na(activity_data_imputed$steps)
missing_intervals <- as.character(activity_data_imputed$interval[missing_rows])
# as.numeric() drops the names/dim carried by the tapply() result.
activity_data_imputed$steps[missing_rows] <-
  as.numeric(average_steps_per_interval[missing_intervals])
str(activity_data_imputed)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : num 1.717 0.3396 0.1321 0.1509 0.0755 ...
## $ date : chr "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
# Daily step totals computed from the imputed data set.
total_steps_imputed <- with(
  activity_data_imputed,
  tapply(steps, date, sum)
)
# Distribution of the daily totals after imputation.
hist(
  total_steps_imputed,
  breaks = 20,
  col = "green",
  main = "Total steps per day (Imputed)",
  xlab = "Total Steps"
)
# Summary statistics of the imputed daily totals.
mean_steps_imputed <- mean(total_steps_imputed)
median_steps_imputed <- median(total_steps_imputed)
mean_steps_imputed
## [1] 10766.19
median_steps_imputed
## [1] 10766.19
# Label each observation as falling on a weekday or a weekend.
activity_data_imputed$date <- as.Date(activity_data_imputed$date)
# weekdays() returns day names in the current locale, so comparing against
# "Saturday"/"Sunday" silently labels every row "weekday" outside an English
# locale. POSIXlt$wday is numeric (0 = Sunday ... 6 = Saturday) and therefore
# locale independent.
is_weekend <- as.POSIXlt(activity_data_imputed$date)$wday %in% c(0, 6)
activity_data_imputed$day_type <- ifelse(is_weekend, "weekend", "weekday")
head(activity_data_imputed)
## steps date interval day_type
## 1 1.7169811 2012-10-01 0 weekday
## 2 0.3396226 2012-10-01 5 weekday
## 3 0.1320755 2012-10-01 10 weekday
## 4 0.1509434 2012-10-01 15 weekday
## 5 0.0754717 2012-10-01 20 weekday
## 6 2.0943396 2012-10-01 25 weekday
library(ggplot2)
# Mean steps for every combination of interval and day type.
steps_by_day_type <- aggregate(
  steps ~ interval + day_type,
  data = activity_data_imputed,
  FUN = mean
)
# Line plot of the interval means, one stacked panel per day type.
ggplot(
  data = steps_by_day_type,
  mapping = aes(x = interval, y = steps, colour = day_type)
) +
  geom_line() +
  facet_wrap(~day_type, ncol = 1) +
  ggtitle("Average Steps per Interval: Weekdays vs Weekends") +
  xlab("5-minute interval") +
  ylab("Average steps")