Synopsis

In this section, I will use a dataset to perform a basic Exploratory Data Analysis (EDA). The dataset originates from a personal activity monitoring device that records the number of steps taken in 5-minute intervals throughout the day. The data were collected over a two-month period—from October to November 2012—from an anonymous individual.

You can find the dataset at the following link:

https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip

1 - Code for reading in the data and/or processing the data

Unzip the dataset

# Extract the downloaded archive (unzip() defaults to the current directory).
unzip(zipfile = "repdata_data_activity.zip")

Load the dataset

# Read the CSV into a data frame using read.csv()'s default settings.
activity_data <- read.csv(file = "activity.csv")

Check the structure of the data

# Compact overview of the columns: names, types and leading values.
str(object = activity_data)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : chr  "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...

2 - Histogram of the total number of steps taken each day

Calculate total steps per day

# Sum the step counts within each calendar date.
# NOTE(review): with na.rm = TRUE a date whose readings are all NA totals 0
# rather than NA, so such dates are treated as zero-step days by the
# histogram and the mean/median computed from this vector -- confirm that
# this is intended.
total_steps_per_day <- with(
  activity_data,
  tapply(steps, date, sum, na.rm = TRUE)
)

print(total_steps_per_day)
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06 2012-10-07 
##          0        126      11352      12116      13294      15420      11015 
## 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12 2012-10-13 2012-10-14 
##          0      12811       9900      10304      17382      12426      15098 
## 2012-10-15 2012-10-16 2012-10-17 2012-10-18 2012-10-19 2012-10-20 2012-10-21 
##      10139      15084      13452      10056      11829      10395       8821 
## 2012-10-22 2012-10-23 2012-10-24 2012-10-25 2012-10-26 2012-10-27 2012-10-28 
##      13460       8918       8355       2492       6778      10119      11458 
## 2012-10-29 2012-10-30 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04 
##       5018       9819      15414          0      10600      10571          0 
## 2012-11-05 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11 
##      10439       8334      12883       3219          0          0      12608 
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17 2012-11-18 
##      10765       7336          0         41       5441      14339      15110 
## 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23 2012-11-24 2012-11-25 
##       8841       4472      12787      20427      21194      14478      11834 
## 2012-11-26 2012-11-27 2012-11-28 2012-11-29 2012-11-30 
##      11162      13646      10183       7047          0

Create a histogram of total steps per day

# Distribution of the daily totals; breaks = 20 is a suggested bin count.
hist(
  x = total_steps_per_day,
  breaks = 20,
  col = "blue",
  main = "Total steps per day",
  xlab = "Total Steps"
)

3 - Mean and median number of steps taken each day

Mean and median of total steps per day

# Centre of the daily totals; any NA totals would be dropped by na.rm = TRUE.
mean_steps <- mean(total_steps_per_day, na.rm = TRUE)
median_steps <- median(total_steps_per_day, na.rm = TRUE)

print(mean_steps)
## [1] 9354.23
print(median_steps)
## [1] 10395

4 - Time series plot of the average number of steps taken

Calculate the average number of steps per interval

# Mean step count for each 5-minute interval id, ignoring NA readings.
# The result is named by interval id.
average_steps_per_interval <- with(
  activity_data,
  tapply(steps, interval, mean, na.rm = TRUE)
)

Time series plot of the average number of steps per interval

# Line chart of the per-interval means; the x values are taken from the
# names of the averaged vector (the interval ids).
plot(
  x = names(average_steps_per_interval),
  y = average_steps_per_interval,
  type = "l",
  main = "Average Daily Activity Pattern",
  xlab = "5-minute interval",
  ylab = "Average number of steps"
)

5 - The 5-minute interval that, on average, contains the maximum number of steps

5-minute interval with the maximum average steps

# Position of the largest per-interval mean. which.max() carries the element
# name along, so the interval id is read directly from the result.
max_interval <- which.max(average_steps_per_interval)
max_interval_value <- names(max_interval)

print(max_interval_value)
## [1] "835"

6 - Code to describe and show a strategy for imputing missing data

Total number of missing values

# Number of rows whose step reading is NA.
total_na <- sum(is.na(activity_data[["steps"]]))

print(total_na)
## [1] 2304

Impute missing values with the mean of the corresponding 5-minute interval

# Work on a copy so the raw data stay available for comparison.
activity_data_imputed <- activity_data

# Fill every missing step count with the mean of its 5-minute interval.
# The lookup is vectorised: the interval ids of the NA rows are matched
# against the names of average_steps_per_interval in a single indexing step,
# which also behaves correctly when the data frame has zero rows.
missing_rows <- is.na(activity_data_imputed$steps)
interval_keys <- as.character(activity_data_imputed$interval[missing_rows])
activity_data_imputed$steps[missing_rows] <-
  as.numeric(average_steps_per_interval[interval_keys])

Check the structure of the imputed dataset

# Compact overview of the imputed data frame's columns.
str(object = activity_data_imputed)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : num  1.717 0.3396 0.1321 0.1509 0.0755 ...
##  $ date    : chr  "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...

Calculate total steps per day after imputing missing values

# Daily step totals computed from the imputed data (no na.rm is passed).
total_steps_imputed <- with(
  activity_data_imputed,
  tapply(steps, date, sum)
)

7 - Histogram of the total number of steps taken each day after missing values are imputed

Histogram of total steps per day after imputing missing values

# Distribution of the imputed daily totals; breaks = 20 is a suggested bin
# count.
hist(
  x = total_steps_imputed,
  breaks = 20,
  col = "green",
  main = "Total steps per day (Imputed)",
  xlab = "Total Steps"
)

Mean and median of total steps per day after imputing missing values

# Centre of the daily totals after imputation.
mean_steps_imputed <- mean(total_steps_imputed)
median_steps_imputed <- median(total_steps_imputed)

print(mean_steps_imputed)
## [1] 10766.19
print(median_steps_imputed)
## [1] 10766.19

Create a new factor variable for weekday vs weekend

# Parse the date strings so calendar information can be derived from them.
activity_data_imputed$date <- as.Date(activity_data_imputed$date)

# Classify each row as weekday or weekend. The POSIXlt `wday` component is a
# number (0 = Sunday ... 6 = Saturday), so the test does not depend on the
# session locale the way the day names returned by weekdays() do.
# The result is stored as a factor with a fixed level order.
activity_data_imputed$day_type <- factor(
  ifelse(
    as.POSIXlt(activity_data_imputed$date)$wday %in% c(0L, 6L),
    "weekend",
    "weekday"
  ),
  levels = c("weekday", "weekend")
)

Check the first few rows to confirm the new variable

# Show the first six rows, including the new day_type column.
head(activity_data_imputed, n = 6L)
##       steps       date interval day_type
## 1 1.7169811 2012-10-01        0  weekday
## 2 0.3396226 2012-10-01        5  weekday
## 3 0.1320755 2012-10-01       10  weekday
## 4 0.1509434 2012-10-01       15  weekday
## 5 0.0754717 2012-10-01       20  weekday
## 6 2.0943396 2012-10-01       25  weekday

Load ggplot2 for plotting

library(ggplot2)

Aggregate steps per interval by day type (weekday or weekend)

# Mean steps for every (interval, day_type) combination, returned as a data
# frame with one row per combination.
steps_by_day_type <- aggregate(
  steps ~ interval + day_type,
  activity_data_imputed,
  mean
)

8 - Panel plot comparing the average number of steps taken per 5-minute interval across weekdays and weekends

Panel plot comparing activity patterns

# One line panel per day type, stacked in a single column.
ggplot(
  data = steps_by_day_type,
  mapping = aes(x = interval, y = steps, color = day_type)
) +
  geom_line() +
  facet_wrap(~day_type, ncol = 1) +
  labs(
    title = "Average Steps per Interval: Weekdays vs Weekends",
    x = "5-minute interval",
    y = "Average steps"
  )