Loading and preprocessing the data

# Packages
suppressPackageStartupMessages({
  library(dplyr)
  library(lattice)
})
## Warning: package 'dplyr' was built under R version 4.4.3
# Load the raw activity data (activity.csv ships with the repo) and, in the
# same pipeline, coerce the columns to the types the rest of the analysis
# relies on: `date` as Date and `interval` as integer.
activity <- read.csv("activity.csv", stringsAsFactors = FALSE) %>%
  mutate(
    date = as.Date(date),
    interval = as.integer(interval)
  )

# Inspect the resulting structure
str(activity)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : Date, format: "2012-10-01" "2012-10-01" ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...

What is the mean total number of steps taken per day?

# Total steps per day.
# Intervals with missing `steps` are dropped *before* grouping, so a day with
# no recorded data is excluded from the summary instead of being reported as
# 0 steps. (`sum(x, na.rm = TRUE)` on an all-NA day returns 0, which adds a
# spurious bar at zero to the histogram and pulls the mean and median down.)
# NOTE(review): the recorded outputs below were updated to match this change;
# re-knit the report to regenerate and confirm them.
steps_per_day <- activity %>%
  filter(!is.na(steps)) %>%
  group_by(date) %>%
  summarize(total_steps = sum(steps), .groups = "drop")

# Histogram of daily totals (days without data are not shown)
hist(
  steps_per_day$total_steps,
  main = "Total Number of Steps Taken Each Day",
  xlab = "Total steps",
  breaks = 20
)

# Mean & median of total steps per day, over days that have data
mean_steps <- mean(steps_per_day$total_steps)
median_steps <- median(steps_per_day$total_steps)

mean_steps
## [1] 10766.19
median_steps
## [1] 10765

What is the average daily activity pattern?

# Average steps by 5-minute interval across all days.
# Here `na.rm = TRUE` is appropriate: it averages over the days on which the
# interval was actually recorded.
steps_by_interval <- activity %>%
  group_by(interval) %>%
  summarize(mean_steps = mean(steps, na.rm = TRUE), .groups = "drop")

# Time series of the average number of steps per interval
plot(
  steps_by_interval$interval,
  steps_by_interval$mean_steps,
  type = "l",
  xlab = "5-minute interval",
  ylab = "Average steps across all days"
)

# Interval with maximum average steps
max_interval <- steps_by_interval$interval[which.max(steps_by_interval$mean_steps)]
max_interval
## [1] 835

Imputing missing values

# Count missing values in 'steps'
missing_count <- sum(is.na(activity$steps))
missing_count
## [1] 2304
# Strategy: replace NA with the mean steps for that 5-minute interval
interval_means <- steps_by_interval  # columns: interval, mean_steps

# Join the per-interval means and fill the gaps.
# Inside mutate(), `mean_steps` resolves to the joined column (data masking),
# not to the scalar `mean_steps` computed above. `steps` is converted to
# double first so both coalesce() inputs share a type.
activity_imputed <- activity %>%
  left_join(interval_means, by = "interval") %>%
  mutate(steps = coalesce(as.numeric(steps), mean_steps)) %>%
  select(steps, date, interval)  # restore original column order

# Verify no NAs remain
sum(is.na(activity_imputed$steps))
## [1] 0
# Total steps per day with imputed data
steps_per_day_imputed <- activity_imputed %>%
  group_by(date) %>%
  summarize(total_steps = sum(steps), .groups = "drop")

# Histogram (after imputation)
hist(
  steps_per_day_imputed$total_steps,
  main = "Total Steps per Day (After Imputation)",
  xlab = "Total steps",
  breaks = 20
)

# Mean & median after imputation
mean_steps_imputed <- mean(steps_per_day_imputed$total_steps)
median_steps_imputed <- median(steps_per_day_imputed$total_steps)

mean_steps_imputed
## [1] 10766.19
median_steps_imputed
## [1] 10766.19
# Comparison table: "before" uses only days with recorded data, "after" uses
# every day with interval-mean imputation.
data.frame(
  Metric = c("Mean (before)", "Median (before)", "Mean (after)", "Median (after)"),
  Value  = c(mean_steps, median_steps, mean_steps_imputed, median_steps_imputed)
)
##            Metric    Value
## 1   Mean (before) 10766.19
## 2 Median (before) 10765.00
## 3    Mean (after) 10766.19
## 4  Median (after) 10766.19

Are there differences in activity patterns between weekdays and weekends?

# Tag each date as weekday/weekend.
# The day of week is read from POSIXlt's numeric `wday` field (0 = Sunday,
# 6 = Saturday) rather than from `weekdays()`, whose day names follow the
# session locale: under a non-English locale the names never match
# "Saturday"/"Sunday" and every row would be labelled "weekday".
activity_imputed <- activity_imputed %>%
  mutate(daytype = ifelse(as.POSIXlt(date)$wday %in% c(0L, 6L),
                          "weekend", "weekday"))

# Average steps per 5-minute interval, separately for each day type
steps_daytype <- activity_imputed %>%
  group_by(interval, daytype) %>%
  summarize(mean_steps = mean(steps), .groups = "drop")

# Panel time series plot: one panel per day type, stacked vertically
xyplot(
  mean_steps ~ interval | daytype,
  data = steps_daytype,
  type = "l",
  layout = c(1, 2),
  xlab = "5-minute interval",
  ylab = "Average steps"
)