Loading and preprocessing the data

## Extract the zipped data set, then read the CSV file into a data frame
unzip("./activity.zip")
activity_dataset <- read.csv(file = "activity.csv", sep = ",")
library(tidyverse)

## -- Attaching packages ------------------------------------------------------------ tidyverse 1.2.1 --

## v ggplot2 3.2.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.3
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## -- Conflicts --------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

library(reshape2)

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

What is the average total number of steps taken per day?

## Daily totals: one row per date, with the summed step count in column `x`.
## NOTE(review): with na.rm = TRUE, sum() returns 0 for a date whose readings
## are all NA, so such dates enter the statistics below as zero-step days.
total_steps_day <- aggregate(
    activity_dataset$steps,
    by = list(Date = activity_dataset$date),
    FUN = sum,
    na.rm = TRUE
)

## Mean and median of the daily totals
mean_steps_day <- mean(total_steps_day[["x"]], na.rm = TRUE)
median_steps_day <- median(total_steps_day[["x"]], na.rm = TRUE)

## c() coerces the rounded number to a string, so print() shows two elements
print(c("Means Nr. of steps/day = ", round(mean_steps_day, digits = 0)))

## [1] "Means Nr. of steps/day = " "9354"

print(c("Median Nr. of steps/day = ", round(median_steps_day, digits = 0)))

## [1] "Median Nr. of steps/day = " "10395"

And here is a histogram of the distribution of the Total Nr. of steps/day:

## Plot histogram of Total # of steps/day.
## The daily totals are on the x axis; the y axis of a histogram is the number
## of days that fall into each bin, so the axes are labelled accordingly.
totaldayplot <- ggplot(total_steps_day, aes(x = x))
totaldayplot +
    geom_histogram(fill = "black", bins = 60) +
    xlab("Total # of steps / day") +
    ylab("Number of days") +
    ggtitle("Histogram of TOTAL # OF STEPS/DAY")

What is the average daily activity pattern?

## Mean number of steps for each interval, averaged over all days with NA
## readings dropped; the result has the columns `Interval` and `x`.
mean_steps_interval <- aggregate(
    activity_dataset$steps,
    by = list(Interval = activity_dataset$interval),
    FUN = mean,
    na.rm = TRUE
)

## Line plot of the average step count against the interval identifier
avg_daypatt_plot <- ggplot(data = mean_steps_interval, aes(x = Interval, y = x))
avg_daypatt_plot +
    geom_line(color = "blue") +
    ylab("Average # of steps") +
    theme_bw() +
    ggtitle("Average # of steps per 5-min interval")

## Report the interval whose mean step count is the largest
busiest_row <- which.max(mean_steps_interval[["x"]])
print(c("The interval which has the highest AVERAGE of step is", mean_steps_interval[busiest_row, 1]))

## [1] "The interval which has the highest AVERAGE of step is"
## [2] "835"

Imputing missing values

## Flag every NA cell of the data frame (all columns), then count the flags
na_cells <- is.na(activity_dataset)
missing <- sum(na_cells)

## Report the count; c() places it next to the label as a string
print(c("The number of missing values in the dataset is", missing))

## [1] "The number of missing values in the dataset is"
## [2] "2304"

Missing values will be replaced with the mean of the 5-minute interval into which the NA value falls.

First we create a new dataset as a copy of the original one, so that we can transform it:

## Work on a copy so that the original data frame stays untouched
imputed_dset <- activity_dataset
## Number of observations (rows) in the copy
x <- nrow(imputed_dset)

And then we generate the imputations:

## Fill every missing step count with the mean of its 5-minute interval.
## The lookup is vectorised: match() finds, for all NA rows at once, the row of
## mean_steps_interval that holds the interval's mean. Columns are addressed by
## name (steps / interval / x) rather than by position, and the assignment is a
## harmless no-op when the data frame is empty or contains no missing values.
na_rows <- is.na(imputed_dset$steps)
interval_pos <- match(imputed_dset$interval[na_rows], mean_steps_interval$Interval)
imputed_dset$steps[na_rows] <- mean_steps_interval$x[interval_pos]

## Daily totals of the imputed data; result columns are `Date` and `x`
imputed_total_steps_day <- aggregate(imputed_dset$steps, by = list(Date = imputed_dset$date), FUN = sum, na.rm = TRUE)

We want to plot the 2 histograms side by side, so we need to merge the 2 datasets and then melt them into long format, with a variable that distinguishes “Original value” from “Imputed value”:

## Put the original and the imputed daily totals side by side.
## NOTE(review): cbind() pairs rows by position, so this assumes both tables
## list the dates in the same order -- both come from aggregate() by date.
combined <- cbind(total_steps_day, imputed_total_steps_day$x)
colnames(combined) <- c("Date", "Original value", "Imputed value")

## Now melt to long format: one row per Date and variable
melted_for_steps <- melt(combined, id.vars = "Date", measure.vars = c("Original value", "Imputed value"))

## Plot histogram: one facet per variable, drawn with a single histogram layer.
## (An additional binwidth = 1 layer would create one bin per step count, i.e.
## thousands of bins, only to be painted over by the 100-bin layer.)
histocombined <- ggplot(melted_for_steps, aes(value)) + facet_wrap(~variable)
histocombined + geom_histogram(bins = 100, colour = "black")

## Calculate mean & median of Dataset with imputed values
mean_steps_day_imputed <- mean(combined$`Imputed value`, na.rm = TRUE)
median_steps_day_imputed <- median(combined$`Imputed value`, na.rm = TRUE)

## Report the mean and median computed from the imputed data
print("AFTER having performed imputation of missing data, the new values for the Mean and Median are:")

## [1] "AFTER having performed imputation of missing data, the new values for the Mean and Median are:"

print(c("Means Nr. of steps/day = ", round(mean_steps_day_imputed, digits = 0)))

## [1] "Means Nr. of steps/day = " "10766"

print(c("Median Nr. of steps/day = ", round(median_steps_day_imputed, digits = 0)))

## [1] "Median Nr. of steps/day = " "10766"

Are there differences in activity patterns between weekdays and weekends?

First we find out which dates were weekdays and which were weekend days (in the dataset with the imputed data):

## Parse the date column (year-month-day text) into Date objects
imputed_dset[["date"]] <- as.Date(imputed_dset[["date"]], format = "%Y-%m-%d")
## Day-of-week name for every row (spelled in the session's locale),
## appended to the data frame as the column `weekday`
weekday <- weekdays(imputed_dset[["date"]])
imputed_dset <- cbind(imputed_dset, weekday)

Then we create a factor variable that differentiates weekdays from weekends, and calculate the average number of steps taken (per 5-minute interval) on weekdays and on weekends:

## Classify each date as "Weekend" (Saturday/Sunday) or "Weekday".
## The day of the week is read numerically from POSIXlt (wday: 0 = Sunday,
## 6 = Saturday) instead of comparing weekdays() names, whose spelling depends
## on the session locale ("sábado"/"domingo" only match in a Spanish locale;
## elsewhere every row would be labelled "Weekday"). The test is vectorised
## over the whole column rather than applied date by date.
is_weekend <- as.POSIXlt(imputed_dset$date)$wday %in% c(0L, 6L)
imputed_dset$findeono <- ifelse(is_weekend, "Weekend", "Weekday")

imputed_dset$findeono <- as.factor(imputed_dset$findeono)

## Average number of steps for every interval, computed separately for the
## "Weekday" and "Weekend" levels of findeono
Mean_by_weektime <- imputed_dset %>%
    group_by(findeono, interval) %>%
    summarise(mean_by_interval = mean(steps))

Finally, we plot this AVERAGE in 2 separate plots (one for the weekday period, another for the weekend):

## Line plot of the per-interval averages, one panel (row) per findeono level
plot_weekdays_vs_ends <- ggplot(Mean_by_weektime, aes(x = interval, y = mean_by_interval))
plot_weekdays_vs_ends +
    geom_line(colour = "blue") +
    facet_grid(findeono ~ .) +
    ylab("Average Nr. of steps")

Reproducible Research: Peer Assessment 1 – Analyzing personal workout monitoring data

Tomas A. Maccor

January 7, 2020

Loading and preprocessing the data

What is the average total number of steps taken per day?

What is the average daily activity pattern?

Imputing missing values

Are there differences in activity patterns between weekdays and weekends?