Reproducible Research: Peer Assessment 1

Loading and preprocessing the data

# Packages
suppressPackageStartupMessages({
  library(dplyr)
  library(lattice)
})

## Warning: package 'dplyr' was built under R version 4.4.3

# Load the raw activity data from the CSV in the working directory,
# keeping text columns as plain character vectors rather than factors.
activity <- read.csv("activity.csv", stringsAsFactors = FALSE)

# Coerce the columns the analysis relies on:
# `date` becomes a Date and `interval` an integer.
activity$date <- as.Date(activity$date)
activity$interval <- as.integer(activity$interval)

# Print a compact overview of the columns and their types.
str(activity)

## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : Date, format: "2012-10-01" "2012-10-01" ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...

What is the mean total number of steps taken per day?

# Daily step totals. NA readings are dropped inside sum(), so a day whose
# readings are all NA ends up with a total of 0 rather than NA.
steps_per_day <- activity %>%
  group_by(date) %>%
  summarise(total_steps = sum(steps, na.rm = TRUE), .groups = "drop")

# Distribution of the daily totals, using about 20 bins.
hist(
  x = steps_per_day[["total_steps"]],
  breaks = 20,
  xlab = "Total steps",
  main = "Total Number of Steps Taken Each Day"
)

# Central tendency of the daily totals (before any imputation).
mean_steps <- mean(steps_per_day[["total_steps"]])
median_steps <- median(steps_per_day[["total_steps"]])

mean_steps

## [1] 9354.23

median_steps

## [1] 10395

What is the average daily activity pattern?

# Mean step count for every 5-minute interval, averaged over all days;
# missing readings are left out of each average.
steps_by_interval <- activity %>%
  group_by(interval) %>%
  summarise(mean_steps = mean(steps, na.rm = TRUE), .groups = "drop")

# Line plot of the average number of steps against the interval identifier.
plot(
  x = steps_by_interval[["interval"]],
  y = steps_by_interval[["mean_steps"]],
  type = "l",
  ylab = "Average steps across all days",
  xlab = "5-minute interval"
)

# Identifier of the interval whose average is highest
# (which.max() picks the first one if several tie).
max_interval <- with(steps_by_interval, interval[which.max(mean_steps)])
max_interval

## [1] 835

Imputing missing values

# How many rows have no recorded step count?
missing_count <- sum(is.na(activity[["steps"]]))
missing_count

## [1] 2304

# Imputation rule: a missing reading takes the mean number of steps
# observed for the same 5-minute interval.
interval_means <- steps_by_interval  # columns: interval, mean_steps

# Attach each row's interval mean, fill the gaps with it, and then keep
# only the original three columns in their original order. `steps` is
# cast to double first because the interval means are not whole numbers.
activity_imputed <- left_join(activity, interval_means, by = "interval")
activity_imputed$steps <- coalesce(
  as.numeric(activity_imputed$steps),
  activity_imputed$mean_steps
)
activity_imputed <- activity_imputed[, c("steps", "date", "interval")]

# Sanity check: the number of NAs left in `steps` should be zero.
sum(is.na(activity_imputed[["steps"]]))

## [1] 0

# Daily step totals recomputed from the imputed data set.
steps_per_day_imputed <- activity_imputed %>%
  group_by(date) %>%
  summarise(total_steps = sum(steps), .groups = "drop")

# Distribution of the daily totals once the gaps have been filled in.
hist(
  x = steps_per_day_imputed[["total_steps"]],
  breaks = 20,
  xlab = "Total steps",
  main = "Total Steps per Day (After Imputation)"
)

# Central tendency of the daily totals after imputation.
mean_steps_imputed <- mean(steps_per_day_imputed[["total_steps"]])
median_steps_imputed <- median(steps_per_day_imputed[["total_steps"]])

mean_steps_imputed

## [1] 10766.19

median_steps_imputed

## [1] 10766.19

# Side-by-side view of the mean and median daily totals before and after
# imputation. Built inside local() so the helper vector does not linger
# in the workspace; the resulting data frame is printed as usual.
local({
  before_after <- c(
    "Mean (before)" = mean_steps,
    "Median (before)" = median_steps,
    "Mean (after)" = mean_steps_imputed,
    "Median (after)" = median_steps_imputed
  )
  data.frame(Metric = names(before_after), Value = unname(before_after))
})

##            Metric    Value
## 1   Mean (before)  9354.23
## 2 Median (before) 10395.00
## 3    Mean (after) 10766.19
## 4  Median (after) 10766.19

Are there differences in activity patterns between weekdays and weekends?

# Tag each date as weekday/weekend.
# The day of the week is read from POSIXlt's numeric `wday` field
# (0 = Sunday, ..., 6 = Saturday) instead of the day names returned by
# weekdays(): those names follow the session locale, so matching them
# against English "Saturday"/"Sunday" would label every day "weekday"
# when the script runs under a non-English locale.
activity_imputed <- activity_imputed %>%
  mutate(daytype = ifelse(as.POSIXlt(date)$wday %in% c(0L, 6L),
                          "weekend", "weekday"))

# Average by interval and daytype
# (one row per interval/daytype pair, returned ungrouped).
steps_daytype <- activity_imputed %>%
  group_by(interval, daytype) %>%
  summarize(mean_steps = mean(steps), .groups = "drop")

# Lattice line plot of the average steps per interval, conditioned on day
# type: one panel per day type, stacked in a single column of two rows.
xyplot(
  mean_steps ~ interval | daytype,
  data = steps_daytype,
  layout = c(1, 2),
  type = "l",
  ylab = "Average steps",
  xlab = "5-minute interval"
)

Reproducible Research: Peer Assessment 1

Omkar Shashank Pathare

Loading and preprocessing the data

What is the mean total number of steps taken per day?

What is the average daily activity pattern?

Imputing missing values

Are there differences in activity patterns between weekdays and weekends?