Reproducible Research: Peer Assessment 1

##Turn off scientific notation

# A large positive `scipen` penalises scientific notation, so numbers are
# printed in fixed notation (e.g. 10766.19 rather than 1.076619e+04).
options(scipen = 999)

##We load the libraries that we are going to use

# Packages this analysis depends on. Only the ones that are not installed yet
# are installed, so re-running the script does not reinstall everything and
# does not need a CRAN mirror when the packages are already present.
# Currently unused candidates: 'lubridate', 'lattice', 'Hmisc'.
required_packages <- c("dplyr", "ggplot2", "sqldf")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (any(!is_installed)) {
  install.packages(required_packages[!is_installed])
}

library(dplyr)
#library(lubridate)
library(ggplot2)
library(sqldf)
#library(lattice)
#library(Hmisc)

1. Loading and preprocessing the data

##Code for reading in the dataset and/or processing the data

# Switch to the author's project folder only when it exists on this machine;
# everywhere else the script expects activity.csv in the current working
# directory instead of aborting on a path that is not there.
project_dir <- 'F:/1. PROYECTOS DE TRABAJO/RStudio/5. Reproducible Research/RepData_PeerAssessment1/'
# The trailing slash is stripped for the existence check only, because
# directory checks on Windows can fail for names that end in a separator.
if (dir.exists(sub("/+$", "", project_dir))) {
  setwd(project_dir)
}
if (!file.exists("activity.csv")) {
  stop("activity.csv not found in ", getwd(), call. = FALSE)
}
# stringsAsFactors = TRUE keeps `date` a factor regardless of the R version
# (the default changed to FALSE in R 4.0.0), matching the str()/summary()
# output recorded further down in this document.
activity <- read.csv("activity.csv", stringsAsFactors = TRUE)

Remove every row that contains an NA from the dataset, then drop the date factor levels that are no longer used because all observations for those dates were NA.

# Keep only the rows in which every column has a value.
rows_without_na <- which(complete.cases(activity))
activity.cleaned <- activity[rows_without_na, , drop = FALSE]
# Re-create the factor so that dates whose rows were all dropped no longer
# appear as levels.
activity.cleaned[["date"]] <- factor(activity.cleaned[["date"]])

##Exploring the basics of this data

# Number of rows and columns in the raw data
dim(activity)

## [1] 17568     3

# Column names of the raw data
names(activity)

## [1] "steps"    "date"     "interval"

# Per-column summary; the `steps` column also reports its NA count
summary(activity)

##      steps                date          interval     
##  Min.   :  0.00   2012-10-01:  288   Min.   :   0.0  
##  1st Qu.:  0.00   2012-10-02:  288   1st Qu.: 588.8  
##  Median :  0.00   2012-10-03:  288   Median :1177.5  
##  Mean   : 37.38   2012-10-04:  288   Mean   :1177.5  
##  3rd Qu.: 12.00   2012-10-05:  288   3rd Qu.:1766.2  
##  Max.   :806.00   2012-10-06:  288   Max.   :2355.0  
##  NA's   :2304     (Other)   :15840

# First four rows of the raw data
head(activity, 4)

##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15

# Type of every column
str(activity)

## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...

# Same information as str(), printed in tibble's compact column-wise layout
tibble::glimpse(activity)

## Observations: 17,568
## Variables: 3
## $ steps    <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ date     <fct> 2012-10-01, 2012-10-01, 2012-10-01, 2012-10-01, 2012-10-01...
## $ interval <int> 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 100, 105, 11...

# Column names of the cleaned data in alphabetical order, shown as a one-column
# data frame; the printed column header is the text of the expression itself.
as.data.frame(sort(names(activity.cleaned)))

##   sort(names(activity.cleaned))
## 1                          date
## 2                      interval
## 3                         steps

# Proportion of rows whose step count is missing
sum(is.na(activity[["steps"]])) / nrow(activity)

## [1] 0.1311475

# Number of distinct dates covered by the data
length(unique(activity[["date"]]))

## [1] 61

2. What is mean total number of steps taken per day?

##Histogram of the total number of steps taken each day

# Total number of steps for each day. aggregate() returns the grouping column
# `Steps.Date` and the summed values in a column named `x`.
steps_taken_per_day <- aggregate(
  activity.cleaned$steps,
  by = list(Steps.Date = activity.cleaned$date),
  FUN = sum
)

#png('plot1.png')
hist(steps_taken_per_day$x, col = "blue",
     breaks = 50,
     main = "Total steps taken each day",
     xlab = "Number of steps taken per day")
rug(steps_taken_per_day$x)
#abline(v = 10700, lwd = 4)
# Mean and median are almost the same value here, so the thick mean line is
# drawn first and the thin median line on top of it; in the opposite order the
# wide red line completely covers the green one.
abline(v = mean(steps_taken_per_day$x), col = 'red', lwd = 8)
abline(v = median(steps_taken_per_day$x), col = 'green', lwd = 2)
legend("topright",
       legend = c("Mean", "Median"),
       col = c("red", "green"),
       lwd = c(8, 2))

#dev.off()

##Calculate the mean and median of the total number of steps taken per day

# Mean of the daily step totals (the aggregated values live in column `x`)
mean(steps_taken_per_day[["x"]])

## [1] 10766.19

# Median of the daily step totals
median(steps_taken_per_day[["x"]])

## [1] 10765

3. What is the average daily activity pattern?

# Mean number of steps for each 5-minute interval, averaged over all days.
# aggregate() returns the grouping column `Interval` and the means in `x`.
steps_taken_per_five_min <- aggregate(
  activity.cleaned$steps,
  by = list(Interval = activity.cleaned$interval),
  FUN = mean
)

# NOTE(review): the interval identifiers look like clock times written as HHMM
# (..., 50, 55, 100, 105, ..., 2355). Plotted as plain numbers they leave a gap
# between xx55 and the next full hour, which stretches the time axis unevenly.
# Converting them to hours since midnight spaces the intervals evenly.
interval_hours <- steps_taken_per_five_min$Interval %/% 100 +
  (steps_taken_per_five_min$Interval %% 100) / 60

#png('plot2.png')
plot(interval_hours, steps_taken_per_five_min$x, type = "l",
     main = "Daily Activity Pattern Average",
     ylab = "Average number of steps taken",
     xlab = "Time of day (hours, 5-minute intervals)")

#dev.off()

##5-minute interval with the maximum average number of steps

# Row index of the interval with the highest average step count
interval_number_steps <- which.max(steps_taken_per_five_min[["x"]])
# Identifier of that interval
steps_taken_per_five_min[["Interval"]][interval_number_steps]

## [1] 835

Reproducible Research: Peer Assessment 1

1. Loading and preprocessing the data

2. What is mean total number of steps taken per day?

3. What is the average daily activity pattern?

4. Imputing missing values

5. Are there differences in activity patterns between weekdays and weekends?