library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
We’ll begin by creating a function that does the job of downloading the data from a connection.
# Download a file from `filefile_url` to `filePath`, unless a file already
# exists at `filePath` (in which case nothing is downloaded).
#
# Args:
#   filefile_url: URL of the file to download.
#   filePath:     Destination path of the downloaded file.
#
# The parent directory of `filePath` is created (recursively) when it is
# missing, so the function works for any destination and not only for paths
# under "./data".
get_files = function (filefile_url, filePath) {
  data_folder = dirname(filePath)
  if (!dir.exists(data_folder)) {
    dir.create(data_folder, recursive = TRUE)
  }
  if (!file.exists(filePath)) {
    # mode = "wb" forces a binary transfer so archives such as .zip files are
    # not corrupted by line-ending translation on Windows.
    download.file(filefile_url, destfile = filePath, mode = "wb")
  }
}
# Fetch the activity-monitor archive (get_files skips the download when the
# zip is already on disk), unpack it into ./data and load the extracted CSV.
file_url = "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2Factivity.zip"
Path = "./data/personal_activity_monitor_data.zip"
get_files(filefile_url = file_url, filePath = Path)
unzip(zipfile = Path, exdir = "./data")
monitor_df = read.csv(file = "./data/activity.csv")
First we will use the lubridate package to convert the date variable into a proper Date object.
# Parse the `date` column with lubridate and store it as `datetime`.
monitor_df[["datetime"]] = ymd(monitor_df[["date"]])
# One row per day with the total, mean and median of `steps`. No na.rm is
# used here, so a day containing any NA step count gets NA summaries.
steps_per_day = summarise(
  group_by(monitor_df, datetime),
  totalsteps = sum(steps),
  meansteps = mean(steps),
  mediansteps = median(steps)
)
Here we pass the special argument `stat = "identity"` so that the bar heights are taken directly from the y values we supply, rather than being computed by counting the number of items in each bin, which is the default.
# Bar chart of the total steps recorded on each day. The bar heights come
# straight from `totalsteps` (stat = "identity"), so geom_bar() is used
# instead of geom_histogram(): the histogram geom forwards binning parameters
# that the identity stat does not accept, which produced the warning
# "Ignoring unknown parameters: binwidth, bins, pad".
ggplot(data = steps_per_day, aes(x = datetime, y = totalsteps)) +
  geom_bar(stat = "identity") +
  labs(title = "Total Steps per Day", subtitle = "Not Filtered for NAs")
## Warning: Removed 8 rows containing missing values (position_stack).
# dev.copy(png, file = "hist1.png",width=480,height=480,units="px")
# dev.off()
# Mean of the per-day mean step counts and median of the per-day median step
# counts; days whose summaries are NA are skipped.
# NOTE(review): these summarise `meansteps`/`mediansteps`, not `totalsteps` -
# confirm that this is the statistic the report intends to quote.
mean_steps = with(steps_per_day, mean(meansteps, na.rm = TRUE))
median_steps = with(steps_per_day, median(mediansteps, na.rm = TRUE))
The mean is `r mean_steps`. The median is `r median_steps`.
We will group by the interval and get the average across all the days. Then we plot.
# Mean number of steps in each interval across all days (NAs ignored),
# followed by a line plot of that average against the interval.
by_interval = monitor_df %>%
  group_by(interval) %>%
  summarize(intervalaverage = mean(steps, na.rm = TRUE))
ggplot(by_interval, aes(x = interval, y = intervalaverage)) +
  geom_line()
# dev.copy(png, file = "hist2.png",width=480,height=480,units="px")
# dev.off()
# Interval(s) whose average step count equals the largest average.
max_interval = by_interval %>%
  filter(intervalaverage == max(intervalaverage)) %>%
  pull(interval)
The interval with the maximum average number of steps is 835.
#Missing values
# Number of missing step counts in the original data
nas = sum(is.na(monitor_df$steps))
# Work on a copy so the original dataframe keeps its NAs
monitor_df_filled = monitor_df
# Fill in NAs with 0. Only the `steps` column is touched: a blanket
# replacement over the whole dataframe would also overwrite a missing value
# in any other column (e.g. turn a missing date into 0).
monitor_df_filled$steps[is.na(monitor_df_filled$steps)] = 0
We summarize again to get the sum, mean, and median of the steps for each day.
# Per-day total, mean and median of `steps`, computed from the NA-filled copy.
steps_per_day_filled = summarise(
  group_by(monitor_df_filled, datetime),
  totalsteps = sum(steps, na.rm = TRUE),
  meansteps = mean(steps, na.rm = TRUE),
  mediansteps = median(steps, na.rm = TRUE)
)
# Mean of the per-day means and median of the per-day medians after filling.
new_mean_steps = with(steps_per_day_filled, mean(meansteps, na.rm = TRUE))
new_median_steps = with(steps_per_day_filled, median(mediansteps, na.rm = TRUE))
Number of NAs: `r nas`. The new mean is `r new_mean_steps`. The new median is `r new_median_steps`.
# Bar chart of the total steps per day after the NAs were filled in. The bar
# heights come straight from `totalsteps` (stat = "identity"), so geom_bar()
# is used instead of geom_histogram(): the histogram geom forwards binning
# parameters that the identity stat does not accept, which produced the
# warning "Ignoring unknown parameters: binwidth, bins, pad".
ggplot(data = steps_per_day_filled, aes(x = datetime, y = totalsteps)) +
  geom_bar(stat = "identity", fill = "blue") +
  labs(title = "Total Steps per Day", subtitle = "Filtered for NAs")
# dev.copy(png, file = "hist3.png",width=480,height=480,units="px")
# dev.off()
There does not seem to be any noticeable change in the data as a result of replacing the NA values with zeros.
We will add a new variable that specifies the day of week. This will allow us to filter for weekdays only and weekends only.
# Getting the day of week (names are in the session locale's language)
day_of_week = weekdays(monitor_df_filled$datetime)
# Label every date as "Weekend" or "Weekday" from its ISO weekday number
# (format "%u": 1 = Monday ... 7 = Sunday). Unlike matching the names returned
# by weekdays() against "Saturday"/"Sunday", this does not depend on the
# session locale, and it is vectorised so no per-element loop is needed.
weekday_weekend = ifelse(
  as.integer(format(monitor_df_filled$datetime, "%u")) >= 6L,
  "Weekend",
  "Weekday"
)
# Adding variable to dataframe
monitor_df_filled$dayofweek = weekday_weekend
Next we will group by the two variables:
- interval (step interval)
- dayofweek
# Total steps for every (dayofweek, interval) pair. `.groups = "drop"` states
# the grouping of the result explicitly and returns an ungrouped tibble, so
# summarise() no longer emits its "has grouped output by 'dayofweek'" message.
# NOTE(review): weekdays and weekends cover different numbers of days, so the
# totals are not directly comparable - consider mean(steps) if the intent is
# to compare typical activity.
dayofweek_steps = monitor_df_filled %>%
  group_by(dayofweek, interval) %>%
  summarize(totalsteps = sum(steps), .groups = "drop")
Finally we can create a multi-panel plot
# One line-chart panel per `dayofweek` value, placed side by side, showing
# `totalsteps` against `interval`. geom_line() uses the identity stat by
# default, so it is not spelled out.
ggplot(dayofweek_steps, aes(x = interval, y = totalsteps)) +
  geom_line() +
  facet_grid(. ~ dayofweek)
# dev.copy(png, file = "hist4.png",width=480,height=480,units="px")
# dev.off()
The activity is higher on weekdays as compared to weekends.