Loading and preprocessing the data

# Read the activity data set from "activity.zip" in the working directory.
# The `date` column is parsed as a date-time using the YYYY-MM-DD pattern.
library(readr)
data <- read_csv(
  file = "activity.zip",
  col_types = cols(date = col_datetime(format = "%Y-%m-%d"))
)

# Load libraries
library(tidyverse)
library(lubridate)

The date is stored in YYYY-MM-DD format, while the time of day is encoded as a 5-minute interval identifier in which 0 means midnight and, for example, 815 means 8:15 a.m., so I'm going to reformat it into proper date/time data.

# Re-parse the `date` column with lubridate's year-month-day parser.
data[["date"]] <- ymd(data[["date"]])

# data$interval2 <- sprintf("%04d", data$interval)
# data$time<- format(strptime(data$interval2, format="%H%M"), format = "%H:%M")
# data$daytime <- paste(data$date, data$time, sep= " ")
# data$daytime <- as.POSIXct(data$daytime, format="%Y-%m-%d %H:%M")

What is mean total number of steps taken per day?

The following histogram depicts the total number of steps each day:

# Total number of steps per day. Missing readings are dropped before summing,
# so a day whose readings are all NA gets a total of 0.
hist_data <- summarise(
  group_by(data, date),
  daily_steps = sum(steps, na.rm = TRUE)
)

# Distribution of the daily totals, split into 20 bins.
ggplot(data = hist_data, mapping = aes(x = daily_steps)) +
  geom_histogram(bins = 20) +
  labs(
    title = "Histogram of the total number of steps taken each day",
    x = "Daily steps",
    y = "Number of obs"
  )

On average, people were taking M = 9354.23 steps daily. The median number of daily steps was Me = 10395.

# Mean of the daily step totals, rounded to two decimals:
round(mean(hist_data[["daily_steps"]], na.rm = TRUE), digits = 2)
## [1] 9354.23
# Median of the daily step totals, rounded to a whole number:
round(median(hist_data[["daily_steps"]], na.rm = TRUE), digits = 0)
## [1] 10395

What is the average daily activity pattern?

The following line graph depicts average steps of each 5min interval during all days of measurement:

# Mean number of steps for each 5-minute interval, averaged over all days
# (missing readings are ignored).
ts_data <- summarise(
  group_by(data, interval),
  steps = mean(steps, na.rm = TRUE)
)

# Line plot of the average daily activity profile.
ggplot(data = ts_data, mapping = aes(x = interval, y = steps)) +
  geom_line() +
  labs(
    title = "Average number of steps in 5min intrevals during day",
    x = "Time in 5min intervals",
    y = "Number of steps"
  )

# Find the 5-minute interval with the highest mean step count:
# average per interval, sort from highest to lowest, keep the first row.
data %>%
  group_by(interval) %>%
  summarise(steps = mean(steps, na.rm = TRUE)) %>%
  arrange(desc(steps)) %>%
  slice(1)
## # A tibble: 1 x 2
##   interval    steps
##      <int>    <dbl>
## 1      835 206.1698

On average, the 5-minute interval starting at 8:35 a.m. contains the maximum number of steps.

Imputing missing values

There is a total of 2304 NA values in the steps column

# Count the rows whose `steps` value is missing (one-row tibble with column `n`).
data %>%
  summarise(n = sum(is.na(steps)))
## # A tibble: 1 x 1
##       n
##   <int>
## 1  2304

In order to remove missing cases (NA values) I computed a 5min interval average across all days and replaced all NAs with it:

# Preview: attach to every row the mean step count of its 5-minute interval
# (computed across all days, ignoring NAs) next to the raw `steps` value.
data %>%
  select(steps, date, interval) %>%
  group_by(interval) %>%
  mutate(mean_steps = mean(steps, na.rm = TRUE)) %>%
  ungroup()
## # A tibble: 17,568 x 4
##    steps       date interval mean_steps
##    <int>     <date>    <int>      <dbl>
##  1    NA 2012-10-01        0  1.7169811
##  2    NA 2012-10-01        5  0.3396226
##  3    NA 2012-10-01       10  0.1320755
##  4    NA 2012-10-01       15  0.1509434
##  5    NA 2012-10-01       20  0.0754717
##  6    NA 2012-10-01       25  2.0943396
##  7    NA 2012-10-01       30  0.5283019
##  8    NA 2012-10-01       35  0.8679245
##  9    NA 2012-10-01       40  0.0000000
## 10    NA 2012-10-01       45  1.4716981
## # ... with 17,558 more rows

For future analysis I’ll keep a separate dataset where missing values have been replaced with computed averages

# Build a copy of the data in which every missing `steps` value is replaced by
# the mean of its 5-minute interval (computed across all days, ignoring NAs).
# Non-missing values are kept as they are.
dataNoNA <- data %>%
  group_by(interval) %>%
  mutate(steps = ifelse(is.na(steps), mean(steps, na.rm = TRUE), steps)) %>%
  # Drop the grouping so it does not leak into later operations on dataNoNA.
  ungroup()

Below is a histogram of the total number of steps taken each day, computed after the missing values were imputed.

# Total number of steps per day, computed on the imputed data set.
hist_data2 <- summarise(
  group_by(dataNoNA, date),
  daily_steps = sum(steps, na.rm = TRUE)
)

# Distribution of the daily totals after imputation, split into 20 bins.
ggplot(data = hist_data2, mapping = aes(x = daily_steps)) +
  geom_histogram(bins = 20) +
  labs(
    title = "Histogram of the total number of steps taken each day",
    x = "Daily steps",
    y = "Number of obs"
  )

# Mean of the daily totals after imputation, rounded to two decimals:
M <- round(mean(hist_data2[["daily_steps"]], na.rm = TRUE), digits = 2)
M
## [1] 10766.19
# Median of the daily totals after imputation, rounded to a whole number:
Me <- round(median(hist_data2[["daily_steps"]], na.rm = TRUE), digits = 0)
Me
## [1] 10766

When NA values are replaced with the 5-minute interval mean for the corresponding missing cell, the mean number of steps per day is M = 10766.19 and the median is Me = 10766.

Are there differences in activity patterns between weekdays and weekends?

# Label every observation as falling on a "weekend" or a "weekday".
# weekdays() returns day names in the current locale, so matching on literal
# names only works under one language setting. The "%u" format yields the ISO
# weekday number instead ("1" = Monday ... "7" = Sunday), which is the same
# in every locale.
dataNoNA <- dataNoNA %>%
  mutate(
    weekday = if_else(
      format(date, "%u") %in% c("6", "7"),
      "weekend",
      "weekday"
    )
  )

Following is a panel plot containing a time series plot of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis).

# Panel plot of the average number of steps per 5-minute interval, with one
# panel for weekdays and one for weekends.
dataNoNA %>%
  group_by(weekday, interval) %>%
  # Use the mean, not the sum: the two panels cover different numbers of days,
  # so totals would not be comparable and would contradict the plot title.
  summarise(steps = mean(steps)) %>%
  ggplot(aes(x = interval, y = steps)) +
  geom_line() +
  labs(
    title = "Average number of steps in 5min intrevals during day",
    x = "Time in 5min intervals",
    y = "Number of steps"
  ) +
  facet_grid(. ~ weekday)