Libraries
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.5
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(lubridate)
## Warning: package 'lubridate' was built under R version 4.0.5
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(ggplot2)
Loading and preprocessing the data
# Extract the archive into the working directory, then load the CSV it holds.
unzip(zipfile = "activity.zip")
data <- read.csv(file = "activity.csv")
Exploring Data
# Quick structural look at the raw table: size, column names, first rows, types.
c(nrow(data), ncol(data))
## [1] 17568 3
colnames(data)
## [1] "steps" "date" "interval"
head(data, n = 6L)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
str(object = data)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : chr "2012-10-01" "2012-10-01" "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
Histogram of the total number of steps taken each day
# Total steps per calendar date. Because na.rm = TRUE is forwarded to sum(),
# a date whose readings are all NA gets a total of 0 rather than NA.
histData <- setNames(
  aggregate(
    x = data$steps,
    by = list(data$date),
    FUN = sum,
    na.rm = TRUE
  ),
  c("date", "steps")
)

# Distribution of the daily totals in 2,500-step bins.
hist(
  x = histData$steps,
  breaks = seq(from = 0, to = 25000, by = 2500),
  col = "blue",
  ylim = c(0, 20),
  main = "Total number of steps taken by day",
  xlab = "Total steps taken per day"
)

What is the mean total number of steps taken per day?
Mean
# Average of the per-day step totals computed above.
mean(x = histData[["steps"]])
## [1] 9354.23
What is the average daily activity pattern?
# Mean steps for each 5-minute interval, averaged over all days (NAs ignored).
data_mean_dailyactivity <- setNames(
  aggregate(
    x = data$steps,
    by = list(data$interval),
    FUN = mean,
    na.rm = TRUE
  ),
  c("interval", "mean")
)

# Line plot of the average day: interval identifier against mean step count.
plot(
  x = data_mean_dailyactivity$interval,
  y = data_mean_dailyactivity$mean,
  type = "l",
  lwd = 2,
  col = "blue",
  main = "Average number of steps per intervals",
  xlab = "Interval",
  ylab = "Average number of steps"
)

Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?
# Interval identifier of the row whose mean step count is the largest.
data_mean_dailyactivity$interval[which.max(data_mean_dailyactivity$mean)]
## [1] 835
Imputing missing values
Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs)
# Count the rows whose step reading is missing.
length(which(is.na(data[["steps"]])))
## [1] 2304
Create a new dataset that is equal to the original dataset but with the missing data filled in.
# Imputation strategy: replace every missing step count with the mean of its
# 5-minute interval, taken over the days on which that interval was recorded.
#
# `cleaned_steps` was referenced below without ever being defined, so the
# block failed with "object 'cleaned_steps' not found". It is now derived
# here. ave() returns one value per row of `data`, aligned with `data$steps`,
# which is what the element-wise ifelse() needs. as.numeric() is applied
# because the interval means are fractional while `steps` is read as integer.
cleaned_steps <- ave(
  as.numeric(data$steps),
  data$interval,
  FUN = function(x) mean(x, na.rm = TRUE)
)

# Copy of the original data with the gaps in `steps` filled in; observed
# values are left untouched.
data_cleaned <- transform(
  data,
  steps = ifelse(is.na(data$steps), yes = cleaned_steps, no = data$steps)
)

# Total steps per day computed from the imputed data.
data_updated <- aggregate(steps ~ date, data_cleaned, sum)
names(data_updated) <- c("date", "daily_steps")
Are there differences in activity patterns between weekdays and weekends?
Create a new factor variable in the dataset with two levels – “weekday” and “weekend” indicating whether a given date is a weekday or weekend day.
# Label every observation as falling on a "Weekday" or a "Weekend".
#
# `date` is read from the CSV as text and weekdays() has no method for
# character input, so the dates are parsed with as.Date() first. The day of
# the week is taken from POSIXlt's numeric `wday` field (0 = Sunday,
# 6 = Saturday) instead of comparing against English day names, which
# weekdays() only returns under an English locale. The whole column is
# classified in one vectorised step rather than one sapply() call per row.
day_of_week <- as.POSIXlt(as.Date(data$date))$wday

# Stored as a two-level factor, as the exercise asks for a factor variable.
data$datetype <- factor(
  ifelse(day_of_week %in% c(0L, 6L), "Weekend", "Weekday"),
  levels = c("Weekday", "Weekend")
)
Make a panel plot containing a time series plot (i.e. type = “l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis). See the README file in the GitHub repository to see an example of what this plot should look like using simulated data.
# Mean steps per interval, computed separately for each day type.
data_by_date <- aggregate(
  formula = steps ~ interval + datetype,
  data = data,
  FUN = mean,
  na.rm = TRUE
)

# Two stacked panels (one per day type), each a line of mean steps by interval.
plot <- ggplot(
  data = data_by_date,
  mapping = aes(x = interval, y = steps, color = datetype)
) +
  geom_line() +
  facet_wrap(~datetype, nrow = 2, ncol = 1) +
  labs(
    title = "Average daily steps by Weekend/Weekday",
    x = "Interval",
    y = "Avg number of steps"
  )
print(plot)
