library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)

Loading Dataset

We’ll begin by writing a function that downloads the data from a URL and saves it to a local file.

# Download a file from a URL to a local path, skipping the download when the
# destination file already exists.
#
# Args:
#   filefile_url: URL of the file to fetch.
#   filePath:     Path the downloaded file is written to.
#
# Returns (invisibly) NULL when the file was already present, otherwise the
# status code returned by download.file().
get_files <- function(filefile_url, filePath) {
  # Create the directory the file is actually written to (instead of a
  # hard-coded "./data"), so any destination path works.
  target_dir <- dirname(filePath)
  if (!dir.exists(target_dir)) {
    dir.create(target_dir, recursive = TRUE)
  }

  if (!file.exists(filePath)) {
    # Binary mode keeps archives such as .zip intact on Windows.
    download.file(filefile_url, destfile = filePath, mode = "wb")
  }
}

# Remote archive and the local path it is saved under
file_url <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
Path <- "./data/personal_activity_monitor_data.zip"

# Fetch the archive
get_files(filefile_url = file_url, filePath = Path)

Unzip to reveal contents

# Extract the downloaded archive into ./data
unzip(zipfile = Path, exdir = "./data")

Reading the data into R

# Load the extracted CSV into a data frame
monitor_df <- read.csv(file = "./data/activity.csv")

Question 1

Dates

First we will use the lubridate package to turn the date variable into a proper date time object

# Parse the year-month-day `date` column with lubridate and store it as `datetime`
monitor_df[["datetime"]] <- ymd(monitor_df[["date"]])

Summarize steps by date

# One row per day with the total, mean and median of `steps`.
# NAs are not removed here, so a day containing missing readings yields NA.
steps_per_day <- summarise(
  group_by(monitor_df, datetime),
  totalsteps = sum(steps),
  meansteps = mean(steps),
  mediansteps = median(steps)
)

Plot the histogram

Here we pass the argument `stat = "identity"` so that the height of each bar is taken from the y values we supply (the total steps for that day), rather than from the default, which counts the number of items in each bin.

# One bar per day; stat = "identity" makes the bar height the supplied y value
ggplot(steps_per_day, mapping = aes(x = datetime, y = totalsteps)) +
  geom_histogram(stat = "identity") +
  labs(subtitle = "Not Filtered for NAs", title = "Total Steps per Day")
## Warning: Ignoring unknown parameters: binwidth, bins, pad
## Warning: Removed 8 rows containing missing values (position_stack).

# dev.copy(png, file = "hist1.png",width=480,height=480,units="px")
# dev.off()
# Mean and median of the daily step totals. Days whose total is NA (because
# they contain missing readings) are skipped. Summarising `totalsteps` gives
# per-day figures; averaging the per-day interval means/medians instead would
# report per-interval values (and a median that is almost always 0).
mean_steps <- mean(steps_per_day$totalsteps, na.rm = TRUE)
median_steps <- median(steps_per_day$totalsteps, na.rm = TRUE)

Figures

The mean is `r mean_steps`. The median is `r median_steps`.

Question 2 : Time Series Data

We will group by the interval and get the average across all the days. Then we plot.

# Average steps for each interval across all days (NAs ignored), drawn as a line
by_interval <- monitor_df %>%
  group_by(interval) %>%
  summarize(intervalaverage = mean(steps, na.rm = TRUE))

ggplot(by_interval, mapping = aes(x = interval, y = intervalaverage)) +
  geom_line()

# dev.copy(png, file = "hist2.png",width=480,height=480,units="px")
# dev.off()
# Interval(s) whose average step count equals the overall maximum
max_interval <- by_interval %>%
  filter(intervalaverage == max(intervalaverage)) %>%
  pull(interval)

The interval with the maximum average number of steps is 835.

Question 3

# Missing values: count the NAs in the steps column
nas <- sum(is.na(monitor_df$steps))

# Work on a copy so the original data frame keeps its NAs
monitor_df_filled <- monitor_df
# Fill missing step counts with 0. Only `steps` is touched, so an NA in any
# other column (e.g. a date) is never silently overwritten with a number.
monitor_df_filled$steps[is.na(monitor_df_filled$steps)] <- 0

We summarize the total, mean, and median steps per day for the filled data:

# One row per day with the total, mean and median of `steps` in the filled data
steps_per_day_filled <- monitor_df_filled %>%
  group_by(datetime) %>%
  summarise(
    totalsteps = sum(steps, na.rm = TRUE),
    meansteps = mean(steps, na.rm = TRUE),
    mediansteps = median(steps, na.rm = TRUE)
  )

# Mean and median of the daily step totals. Summarising `totalsteps` gives
# per-day figures; averaging the per-day interval means/medians instead would
# report per-interval values (and a median that is almost always 0).
new_mean_steps <- mean(steps_per_day_filled$totalsteps, na.rm = TRUE)
new_median_steps <- median(steps_per_day_filled$totalsteps, na.rm = TRUE)

Figures

Number of NAs: `r nas`. The new mean is `r new_mean_steps`. The new median is `r new_median_steps`.

# One blue bar per day; stat = "identity" makes the bar height the supplied y value
ggplot(steps_per_day_filled, mapping = aes(x = datetime, y = totalsteps)) +
  geom_histogram(fill = "blue", stat = "identity") +
  labs(subtitle = "Filtered for NAs", title = "Total Steps per Day")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

# dev.copy(png, file = "hist3.png",width=480,height=480,units="px")
# dev.off()

Conclusion

There does not seem to be any noticeable change in the data as a result of replacing the NA values with zeros.

Question 4 : Weekdays vs Weekends

We will add a new variable that specifies the day of week. This will allow us to filter for weekdays only and weekends only.

# Getting the day of week (day names depend on the session locale)
day_of_week <- weekdays(monitor_df_filled$datetime)

# Label every date as "Weekend" or "Weekday". The day number with Monday = 1
# (so Saturday = 6, Sunday = 7) is used instead of comparing day names, which
# keeps the result independent of the locale. The whole column is classified
# in one vectorised call, and an NA date stays NA instead of raising an error.
weekday_weekend <- ifelse(
  wday(monitor_df_filled$datetime, week_start = 1) >= 6,
  "Weekend",
  "Weekday"
)

# Adding variable to dataframe
monitor_df_filled$dayofweek <- weekday_weekend

Next we will group by the two variables:

- interval (step interval)
- dayofweek

# Total steps for every (dayofweek, interval) pair. `.groups = "drop"` returns
# an ungrouped tibble, so later verbs are not silently applied per group and
# summarise() no longer prints its regrouping message.
# NOTE(review): these are sums over groups that contain different numbers of
# days; if the goal is to compare activity levels, mean(steps) may be the
# fairer statistic - confirm the intent.
dayofweek_steps <- monitor_df_filled %>%
  group_by(dayofweek, interval) %>%
  summarize(totalsteps = sum(steps), .groups = "drop")
## `summarise()` has grouped output by 'dayofweek'. You can override using the `.groups` argument.

Finally we can create a multi-panel plot

# Line of total steps per interval, one panel per dayofweek value side by side
ggplot(dayofweek_steps, mapping = aes(x = interval, y = totalsteps)) +
  geom_line() +
  facet_grid(. ~ dayofweek)

# dev.copy(png, file = "hist4.png",width=480,height=480,units="px")
# dev.off()

The activity is higher on weekdays as compared to weekends.