First, we load the data from activity.csv. We use the dplyr library to make data manipulation easier; dates are handled with the lubridate library.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Read the raw activity log; the literal string "NA" marks a missing value.
activity <- read.csv(
  file = "/Users/arunasingh/Downloads/Projects/R Projects/activity.csv",
  header = TRUE,
  sep = ",",
  na.strings = "NA"
)
Next, we calculate the total number of steps taken each day; we omit the incomplete cases, i.e., the rows in which the number of steps is not reported (missing in the data).
# Daily step totals, computed only from rows that have no missing fields.
activity_steps <- activity[complete.cases(activity), ] %>%
  group_by(date) %>%
  summarise(total = sum(steps))
## `summarise()` ungrouping output (override with `.groups` argument)
Next, we make a histogram of the total number of steps in order to see its variance and distribution.
# Distribution of the per-day step totals.
hist(
  activity_steps$total,
  breaks = 10,
  main = "Histogram of total number of steps taken each day",
  xlab = "Total number of steps per day"
)
Finally, we compute the mean and median of the total number of steps per day.
# Mean of the daily totals (rows with missing data were excluded upstream).
mean(activity_steps[["total"]])
## [1] 10766.19
# Median of the same daily totals.
median(activity_steps[["total"]])
## [1] 10765
Below, we can see how the number of steps changes throughout the day (averaged across all the days).
# Average step count for each 5-minute interval, ignoring missing readings.
steps_per_interval <- summarise(
  group_by(activity, interval),
  mean = mean(steps, na.rm = TRUE)
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Line chart of the interval averages: interval on x, mean steps on y.
plot(
  x = steps_per_interval$interval,
  y = steps_per_interval$mean,
  type = 'l',
  main = "Average number of steps per interval",
  xlab = "Time interval",
  ylab = "Average number of steps"
)
The 5-minute interval with the maximum average number of steps is as follows:
# Largest interval average, then the interval(s) that attain it.
max_steps <- max(steps_per_interval$mean)
steps_per_interval$interval[which(steps_per_interval$mean == max_steps)]
## [1] 835
The total number of rows with missing values:
# Rows with at least one missing field = all rows minus the complete ones.
nrow(activity) - sum(complete.cases(activity))
## [1] 2304
We fill the missing values of steps with the average number of steps for the given 5-minute interval.
# Look up every row's interval average in a single vectorised pass: match()
# gives the position of each activity interval within steps_per_interval, which
# avoids running subset() once per row and the type instability of sapply().
interval_means <- steps_per_interval$mean[
  match(activity$interval, steps_per_interval$interval)
]
# Rows whose step count was not reported.
missing <- is.na(activity$steps)
# Work on a copy so the raw data stays untouched, then fill only the gaps.
activity_imputed <- activity
activity_imputed$steps[missing] <- interval_means[missing]
Next, we calculate the total number of steps per day on the imputed data and plot its histogram in order to compare it with the original data.
# Daily step totals after the missing values have been filled in.
total_steps_imputed <- summarise(
  group_by(activity_imputed, date),
  total = sum(steps)
)
## `summarise()` ungrouping output (override with `.groups` argument)
# Distribution of the per-day totals computed from the imputed data.
hist(
  total_steps_imputed$total,
  breaks = 10,
  main = "Histogram of total number of steps per day with no missing values",
  xlab = "Total number of steps per day"
)
For the sake of comparison, we also compute new mean and median values of the total number of steps.
# Keep only the days whose imputed total is available, then display them.
inputed <- total_steps_imputed %>%
  filter(!is.na(total))
print(inputed)
## # A tibble: 61 x 2
## date total
## <chr> <dbl>
## 1 2012-10-01 10766.
## 2 2012-10-02 126
## 3 2012-10-03 11352
## 4 2012-10-04 12116
## 5 2012-10-05 13294
## 6 2012-10-06 15420
## 7 2012-10-07 11015
## 8 2012-10-08 10766.
## 9 2012-10-09 12811
## 10 2012-10-10 9900
## # … with 51 more rows
# Mean of the daily totals after imputation.
mean(inputed[["total"]])
## [1] 10766.19
# Median of the daily totals after imputation.
median(inputed[["total"]])
## [1] 10766.19
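We can see that the chosen strategy has no effect on the mean. The median is slightly higher and now equals the mean, because every day whose values were all missing receives an imputed daily total equal to the mean daily total (10766.19).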
We add a column day_in_week (a factor variable) that indicates whether the measurement was taken on a weekday or on a weekend day.
#' Classify dates as "weekday" or "weekend"
#'
#' @param date A vector of dates: anything `as.POSIXlt()` accepts, e.g. `Date`,
#'   `POSIXct`, or character strings such as "2012-10-01".
#' @return A factor the same length as `date` with levels "weekday" and
#'   "weekend" (both levels are always present); `NA` dates yield `NA`.
day_in_week <- function(date) {
  # POSIXlt stores the day of the week as 0 (Sunday) through 6 (Saturday),
  # independent of locale and of lubridate's week-start option. The previous
  # test (`wday == 1 | wday == 6` on lubridate's 1 = Sunday numbering) flagged
  # Sunday and Friday, so Saturdays were counted as weekdays.
  day_number <- as.POSIXlt(date)$wday
  is_weekend <- day_number == 0 | day_number == 6
  factor(
    ifelse(is_weekend, "weekend", "weekday"),
    levels = c("weekday", "weekend")
  )
}
# Tag every observation with its weekday/weekend label.
activity_imputed$day_in_week <- day_in_week(activity_imputed$date)
Finally, we plot the time series of the average number of steps per interval, comparing weekdays and weekends. We use the ggplot2 library for this purpose.
# Average steps per interval, computed separately for weekdays and weekends.
steps_per_interval_imputed <- summarize(
  group_by(activity_imputed, interval, day_in_week),
  mean = mean(steps, na.rm = TRUE)
)
library(ggplot2)
# One line panel per day type, stacked vertically; `g` holds the unlabelled
# plot and the axis labels are added only for display.
g <- ggplot(steps_per_interval_imputed, aes(interval, mean)) +
  facet_grid(day_in_week ~ .) +
  geom_line()
g + labs(x = "Interval", y = "Number of steps")