Loading and preprocessing the data
# Packages
suppressPackageStartupMessages({
library(dplyr)
library(lattice)
})
## Warning: package 'dplyr' was built under R version 4.4.3
# Load the activity dataset (the CSV ships with the repo) and, in the same
# pipeline, coerce the columns the analysis relies on: `date` is parsed to a
# Date and `interval` is stored as an integer.
activity <- read.csv("activity.csv", stringsAsFactors = FALSE) %>%
  mutate(
    interval = as.integer(interval),
    date = as.Date(date)
  )

# Inspect the resulting column types
str(activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Date, format: "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
What is the mean total number of steps taken per day?
# Daily totals: one row per date holding the sum of the recorded steps.
# NOTE(review): because of `na.rm = TRUE`, a date whose readings are all NA
# gets a total of 0 (not NA), so such dates are counted as zero-step days in
# the histogram, mean and median below -- confirm this is intended.
steps_per_day <- summarise(
  group_by(activity, date),
  total_steps = sum(steps, na.rm = TRUE),
  .groups = "drop"
)

# Distribution of the daily totals
hist(
  steps_per_day$total_steps,
  breaks = 20,
  main = "Total Number of Steps Taken Each Day",
  xlab = "Total steps"
)

# Central tendency of the daily totals
mean_steps <- with(steps_per_day, mean(total_steps))
median_steps <- with(steps_per_day, median(total_steps))
mean_steps
## [1] 9354.23
median_steps
## [1] 10395
What is the average daily activity pattern?
# Mean number of steps for each 5-minute interval, averaged over every day
# in the data; missing readings are dropped before averaging.
steps_by_interval <- activity %>%
  group_by(interval) %>%
  summarise(mean_steps = mean(steps, na.rm = TRUE), .groups = "drop")

# Line plot of the average daily profile.
# NOTE(review): interval values look like HHMM clock codes (e.g. 835), so the
# x axis is presumably not evenly spaced in minutes -- confirm before reading
# slopes off this plot.
with(
  steps_by_interval,
  plot(
    interval,
    mean_steps,
    type = "l",
    xlab = "5-minute interval",
    ylab = "Average steps across all days"
  )
)

# Interval whose average step count is highest (first one if tied)
max_interval <- with(steps_by_interval, interval[which.max(mean_steps)])
max_interval
## [1] 835
Imputing missing values
# How many step readings are missing?
missing_count <- with(activity, sum(is.na(steps)))
missing_count
## [1] 2304

# Imputation rule: a missing reading is filled with the across-days mean of
# its 5-minute interval, taken from the per-interval averages computed above.
interval_means <- steps_by_interval # columns: interval, mean_steps

# Attach each row's interval mean, keep observed values where present and
# fall back to the interval mean otherwise, then drop the helper column by
# selecting the original three columns in their original order.
activity_imputed <- activity %>%
  left_join(interval_means, by = "interval") %>%
  mutate(steps = ifelse(!is.na(steps), steps, mean_steps)) %>%
  select(steps, date, interval)

# Sanity check: the imputed data should contain no missing steps
sum(is.na(activity_imputed$steps))
## [1] 0

# Daily totals recomputed on the imputed data
steps_per_day_imputed <- summarise(
  group_by(activity_imputed, date),
  total_steps = sum(steps),
  .groups = "drop"
)

# Distribution of the daily totals after imputation
hist(
  steps_per_day_imputed$total_steps,
  breaks = 20,
  main = "Total Steps per Day (After Imputation)",
  xlab = "Total steps"
)

# Central tendency of the daily totals after imputation
mean_steps_imputed <- with(steps_per_day_imputed, mean(total_steps))
median_steps_imputed <- with(steps_per_day_imputed, median(total_steps))
mean_steps_imputed
## [1] 10766.19
median_steps_imputed
## [1] 10766.19

# Side-by-side view of the summary statistics before and after imputation
data.frame(
  Metric = c(
    "Mean (before)",
    "Median (before)",
    "Mean (after)",
    "Median (after)"
  ),
  Value = c(
    mean_steps,
    median_steps,
    mean_steps_imputed,
    median_steps_imputed
  )
)
## Metric Value
## 1 Mean (before) 9354.23
## 2 Median (before) 10395.00
## 3 Mean (after) 10766.19
## 4 Median (after) 10766.19
Are there differences in activity patterns between weekdays and
weekends?
# Tag each row as "weekday" or "weekend".
# The day of the week is taken from the numeric `wday` component of POSIXlt
# (0 = Sunday, ..., 6 = Saturday) rather than from weekdays(), whose day
# names depend on the session locale: comparing them with the English
# strings "Saturday"/"Sunday" would label every row "weekday" under a
# non-English LC_TIME setting.
activity_imputed <- activity_imputed %>%
  mutate(
    daytype = ifelse(
      as.POSIXlt(date)$wday %in% c(0L, 6L),
      "weekend",
      "weekday"
    )
  )

# Mean steps for every (interval, daytype) combination; the imputed data has
# no missing steps, so no na.rm is needed here.
steps_daytype <- activity_imputed %>%
  group_by(interval, daytype) %>%
  summarize(mean_steps = mean(steps), .groups = "drop")

# Two stacked panels (one column, two rows), one per day type, each showing
# the average daily profile as a line.
xyplot(
  mean_steps ~ interval | daytype,
  data = steps_daytype,
  type = "l",
  layout = c(1, 2),
  xlab = "5-minute interval",
  ylab = "Average steps"
)
