Loading and preprocessing the data

library(ggplot2)
library(dplyr)
library(data.table)
# Read the raw activity log; as the preview below shows, `steps` contains NAs.
data_na <- utils::read.csv(file = "activity.csv")
head(data_na, n = 6L)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25
# Working copy without any incomplete rows; `data_na` itself is kept untouched
# because the imputation section needs the missing values.
data <- stats::na.omit(object = data_na)
head(data, n = 6L)
##     steps       date interval
## 289     0 2012-10-02        0
## 290     0 2012-10-02        5
## 291     0 2012-10-02       10
## 292     0 2012-10-02       15
## 293     0 2012-10-02       20
## 294     0 2012-10-02       25

What is the mean total number of steps taken per day?

# Total number of steps per day: one row per date, NA rows already removed.
passos <- summarise(
  group_by(data, date),
  steps = sum(steps)
)
head(passos)
## # A tibble: 6 x 2
##   date       steps
##   <fct>      <int>
## 1 2012-10-02   126
## 2 2012-10-03 11352
## 3 2012-10-04 12116
## 4 2012-10-05 13294
## 5 2012-10-06 15420
## 6 2012-10-07 11015
# Mean of the daily totals, reported in Portuguese and in English.
# paste() already separates its arguments with a single space by default.
media <- with(passos, mean(steps))
paste("A media foi de: ", media)
## [1] "A media foi de:  10766.1886792453"
paste("The mean is: ", media)
## [1] "The mean is:  10766.1886792453"
# Median of the daily totals, reported the same way.
mediana <- with(passos, median(steps))
paste("A mediana foi de: ", mediana)
## [1] "A mediana foi de:  10765"
paste("The median is: ", mediana)
## [1] "The median is:  10765"
# Histogram of the total number of steps per day.
hist(
  passos$steps,
  xlab = "Numero de passos por dia",
  ylab = "Frequencia",
  main = "Histograma do Numero Total de Passos por Dia",
  col = "pink"
)
# Vertical markers: dotted green line for the mean, dashed red for the median.
abline(v = c(media, mediana), col = c("green", "red"), lty = c(3, 2), lwd = 3)
# The legend draws line samples with the same colour, type and width as the
# markers above, so the key matches what is actually on the plot (filled
# swatches would suggest bars rather than lines).
legend(
  "topright",
  legend = c("Mean", "Median"),
  col = c("green", "red"),
  lty = c(3, 2),
  lwd = 3
)

What is the average daily activity pattern?

# Average number of steps in each 5-minute interval, taken across all days.
media_de_passadas <- summarise(
  group_by(data, interval),
  steps = mean(steps)
)
head(media_de_passadas)
## # A tibble: 6 x 2
##   interval  steps
##      <int>  <dbl>
## 1        0 1.72  
## 2        5 0.340 
## 3       10 0.132 
## 4       15 0.151 
## 5       20 0.0755
## 6       25 2.09
# Line plot of the daily activity profile (interval on x, mean steps on y).
plot(
  x = media_de_passadas$interval,
  y = media_de_passadas$steps,
  type = "l",
  xlab = "Intervalo de 5 minutos",
  ylab = "Media de Passadas",
  main = "Media Diaria de Passadas por Intervalo"
)

# Identifier of the interval whose average step count is the highest.
# Extracted as a plain scalar (not a 1x1 tibble) so it formats predictably.
maior_intervalo <- media_de_passadas$interval[which.max(media_de_passadas$steps)]
# The reported value is the interval itself, not a step count, so the messages
# say so explicitly.
paste("O intervalo com a maior media de passadas ao longo do dia foi: ", maior_intervalo, sep = " ")
## [1] "O intervalo com a maior media de passadas ao longo do dia foi:  835"
paste("The 5-minute interval with the maximum average number of steps was: ", maior_intervalo, sep = " ")
## [1] "The 5-minute interval with the maximum average number of steps was:  835"

Imputing missing values

# Count the missing step values; summing a logical vector counts its TRUEs.
size <- sum(is.na(data_na$steps))
paste("Numero de NAs: ", size)
## [1] "Numero de NAs:  2304"
paste("Number of NAs: ", size)
## [1] "Number of NAs:  2304"
# Impute every missing step count with the overall mean of the observed values.
# The right-hand side is evaluated before the assignment, so the mean is taken
# over the original, non-missing observations only.
data_na$steps[is.na(data_na$steps)] <- mean(data_na$steps, na.rm = TRUE)
head(data_na)
##     steps       date interval
## 1 37.3826 2012-10-01        0
## 2 37.3826 2012-10-01        5
## 3 37.3826 2012-10-01       10
## 4 37.3826 2012-10-01       15
## 5 37.3826 2012-10-01       20
## 6 37.3826 2012-10-01       25
# Confirm that no missing values remain after imputation.
size <- sum(is.na(data_na$steps))
paste("Numero de NAs: ", size, sep = " ")
## [1] "Numero de NAs:  0"
paste("Number of NAs: ", size, sep = " ")
## [1] "Number of NAs:  0"
# Total steps per day after imputation; the names of `filtro` are the dates.
filtro <- tapply(data_na$steps, data_na$date, sum)
head(filtro)
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06 
##   10766.19     126.00   11352.00   12116.00   13294.00   15420.00
# The column holds calendar dates with no time-of-day, so store it as Date.
# A date-time class would make the result depend on the session time zone
# (the original run emitted an "unknown timezone" warning at this step).
data_na$date <- as.Date(data_na$date)
# Month number (1-12) of each observation.
data_na$mes <- as.numeric(format(data_na$date, "%m"))
media_dia <- mean(filtro)
mediana_dia <- median(filtro)
# Histogram of the daily totals with the imputed values included.
hist(
  filtro,
  xlab = "Numero de Passos por dia",
  ylab = "Frequencia",
  main = "Histograma com o numero total de passos por dia com NAs tratados",
  col = "#2073d4"
)
# Vertical markers: dotted green line for the mean, dashed red for the median.
abline(v = c(media_dia, mediana_dia), col = c("green", "red"), lty = c(3, 2), lwd = 3)
# Legend keyed by line samples so it matches the markers drawn above.
legend(
  "topright",
  legend = c("Mean", "Median"),
  col = c("green", "red"),
  lty = c(3, 2),
  lwd = 3
)

Are there differences in activity patterns between weekdays and weekends?

# Day name, kept for display only: weekdays() returns names in the session
# locale, so their spelling is not a reliable key for classification.
data_na$dia <- weekdays(data_na$date)
head(data_na)
##     steps       date interval mes    dia
## 1 37.3826 2012-10-01        0  10 Monday
## 2 37.3826 2012-10-01        5  10 Monday
## 3 37.3826 2012-10-01       10  10 Monday
## 4 37.3826 2012-10-01       15  10 Monday
## 5 37.3826 2012-10-01       20  10 Monday
## 6 37.3826 2012-10-01       25  10 Monday
# Classify by the numeric day of the week (POSIXlt `wday`: 0 = Sunday,
# 6 = Saturday) instead of matching day names. This is locale-independent, so
# no Sys.setlocale() call is needed; the previous call ran after weekdays()
# had already been evaluated and could not be honoured on this system anyway.
# Name matching also misclassified Saturdays in locales that spell the day
# with an accent.
data_na$tipo_dia <- ifelse(
  as.POSIXlt(data_na$date)$wday %in% c(0L, 6L),
  "Weekend",
  "Weekday"
)
head(data_na)
##     steps       date interval mes    dia tipo_dia
## 1 37.3826 2012-10-01        0  10 Monday  Weekday
## 2 37.3826 2012-10-01        5  10 Monday  Weekday
## 3 37.3826 2012-10-01       10  10 Monday  Weekday
## 4 37.3826 2012-10-01       15  10 Monday  Weekday
## 5 37.3826 2012-10-01       20  10 Monday  Weekday
## 6 37.3826 2012-10-01       25  10 Monday  Weekday
# Store the day type as a factor (levels: Weekday, Weekend).
data_na$tipo_dia <- as.factor(data_na$tipo_dia)
summary(data_na$tipo_dia)
## Weekday Weekend 
##   12960    4608
# Mean steps for every (day type, interval) combination. The result stays
# grouped by `tipo_dia`, as the "Groups" line of the printout shows.
medias_dias <- summarise(
  group_by(data_na, tipo_dia, interval),
  steps = mean(steps)
)
head(medias_dias)
## # A tibble: 6 x 3
## # Groups:   tipo_dia [1]
##   tipo_dia interval steps
##   <fct>       <int> <dbl>
## 1 Weekday         0  7.01
## 2 Weekday         5  5.38
## 3 Weekday        10  5.14
## 4 Weekday        15  5.16
## 5 Weekday        20  5.07
## 6 Weekday        25  6.30
# One stacked panel per day type, with the line coloured by day type.
ggplot(medias_dias, aes(x = interval, y = steps)) +
  geom_line(aes(colour = tipo_dia), stat = "identity") +
  facet_grid(tipo_dia ~ .) +
  labs(
    title = "No de passadas por intervalo por tipo de Dia",
    x = "Intervalo",
    y = expression("Numero de passos")
  )

Alternative Mode

# Rows of the per-interval averages that belong to weekdays.
weekday_db <- medias_dias[medias_dias[["tipo_dia"]] == "Weekday", ]
weekday_db
## # A tibble: 288 x 3
## # Groups:   tipo_dia [1]
##    tipo_dia interval steps
##    <fct>       <int> <dbl>
##  1 Weekday         0  7.01
##  2 Weekday         5  5.38
##  3 Weekday        10  5.14
##  4 Weekday        15  5.16
##  5 Weekday        20  5.07
##  6 Weekday        25  6.30
##  7 Weekday        30  5.61
##  8 Weekday        35  6.01
##  9 Weekday        40  4.98
## 10 Weekday        45  6.58
## # ... with 278 more rows
# Rows of the per-interval averages that belong to weekends.
weekend_db <- medias_dias[medias_dias[["tipo_dia"]] == "Weekend", ]
weekend_db
## # A tibble: 288 x 3
## # Groups:   tipo_dia [1]
##    tipo_dia interval steps
##    <fct>       <int> <dbl>
##  1 Weekend         0  4.67
##  2 Weekend         5  4.67
##  3 Weekend        10  4.67
##  4 Weekend        15  4.67
##  5 Weekend        20  4.67
##  6 Weekend        25  7.92
##  7 Weekend        30  4.67
##  8 Weekend        35  4.67
##  9 Weekend        40  4.67
## 10 Weekend        45  5.05
## # ... with 278 more rows
# Weekday activity profile: dashed red line of mean steps per interval.
# The y-axis label is spelled "Passos" (it previously read "Passsos").
data_weekdays <- ggplot(weekday_db) +
  geom_line(aes(x = interval, y = steps), colour = "red", linetype = "dashed") +
  xlab("Intervalos de Dias Uteis") +
  ylab("Passos")

# Weekend activity profile: dotted blue line of mean steps per interval.
data_weekends <- ggplot(weekend_db) +
  geom_line(aes(x = interval, y = steps), colour = "blue", linetype = "dotted") +
  xlab("Intervalos de Fins de Semana") +
  ylab("Passos")
#install.packages('gridExtra')
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Stack the weekday plot above the weekend plot in a single column, with a
# shared title and outer axis captions.
grid.arrange(
  grobs = list(data_weekdays, data_weekends),
  ncol = 1,
  top = "Dias Uteis x Fim de Semana (Media de Passadas)",
  left = "Y",
  bottom = "X"
)