## Turn off scientific notation when printing numbers
## (a large `scipen` value biases R's output towards fixed notation).

options(scipen = 999) 

## Load the libraries used by this analysis.
## Packages are installed only when they are not already available, so that
## re-running the script does not download and reinstall them every time.

required_packages <- c('dplyr', 'ggplot2', 'sqldf')
is_installed <- vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}
#install.packages('lubridate')
#install.packages('lattice')
#install.packages('Hmisc')
library(dplyr)
#library(lubridate)
library(ggplot2)
library(sqldf)
#library(lattice)
#library(Hmisc)

1. Loading and preprocessing the data

## Read the activity dataset.
## The project directory below only exists on the original author's machine.
## Switch to it when possible; otherwise keep the current working directory so
## the script still runs wherever activity.csv is available.

tryCatch(
  setwd('F:/1. PROYECTOS DE TRABAJO/RStudio/5. Reproducible Research/RepData_PeerAssessment1/'),
  error = function(e) {
    message("Project directory not available; using working directory: ", getwd())
  }
)
if (!file.exists("activity.csv")) {
  stop("activity.csv not found in working directory: ", getwd(), call. = FALSE)
}
## Read `date` as a factor on every R version (read.csv's default became
## character in R 4.0.0); the summaries and level clean-up below expect a factor.
activity <- read.csv("activity.csv", stringsAsFactors = TRUE)

Remove rows with missing values from the dataset, then rebuild the date factor so that dates whose observations were all NA no longer appear as levels.

## Keep only the fully observed rows, then re-create the date factor so that
## levels without any remaining rows are dropped.
activity.cleaned <- activity[which(complete.cases(activity)), ]
activity.cleaned[["date"]] <- factor(activity.cleaned[["date"]])

## A first look at the raw data (console output is pasted after each call)

# number of rows and columns
c(nrow(activity), ncol(activity))
## [1] 17568     3
# column names
colnames(activity)
## [1] "steps"    "date"     "interval"
# per-column summary, including the NA count for steps
summary(activity)
##      steps                date          interval     
##  Min.   :  0.00   2012-10-01:  288   Min.   :   0.0  
##  1st Qu.:  0.00   2012-10-02:  288   1st Qu.: 588.8  
##  Median :  0.00   2012-10-03:  288   Median :1177.5  
##  Mean   : 37.38   2012-10-04:  288   Mean   :1177.5  
##  3rd Qu.: 12.00   2012-10-05:  288   3rd Qu.:1766.2  
##  Max.   :806.00   2012-10-06:  288   Max.   :2355.0  
##  NA's   :2304     (Other)   :15840
# first four rows
head(activity, n = 4L)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
# column types
str(activity)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...
tibble::glimpse(activity)
## Observations: 17,568
## Variables: 3
## $ steps    <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ date     <fct> 2012-10-01, 2012-10-01, 2012-10-01, 2012-10-01, 2012-10-01...
## $ interval <int> 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 100, 105, 11...
# alphabetical list of the cleaned data's columns
# (the call is left as written: its text becomes the printed column header)
as.data.frame(sort(names(activity.cleaned)))
##   sort(names(activity.cleaned))
## 1                          date
## 2                      interval
## 3                         steps
# proportion of rows whose step count is missing
sum(is.na(activity$steps)) / nrow(activity)
## [1] 0.1311475
# number of distinct dates in the raw data
sum(!duplicated(activity$date))
## [1] 61

2. What is the mean total number of steps taken per day?

## Histogram of the total number of steps taken each day

# One row per day: `Steps.Date` is the day, `x` is the sum of that day's steps.
steps_taken_per_day <- aggregate(activity.cleaned$steps, by = list(Steps.Date = activity.cleaned$date), FUN = 'sum')

#png('plot1.png')
hist(steps_taken_per_day$x, col = "blue", 
     breaks = 50,
     main = "Total steps taken each day",
     xlab = "Number of steps taken per day")
rug(steps_taken_per_day$x)
#abline(v = 10700, lwd = 4)
# The mean and median are only about one step apart for this data, so the wide
# mean line is drawn first and the narrow median line on top of it; drawn the
# other way round, the median line is completely painted over.
abline(v = mean(steps_taken_per_day$x), col = 'red', lwd = 8) 
abline(v = median(steps_taken_per_day$x), col = 'green', lwd = 2) 
legend("topright", legend = c("Mean", "Median"),
       col = c("red", "green"), lwd = c(8, 2))

#dev.off()

## Mean and median of the daily step totals
## (the totals are the second column of steps_taken_per_day)

mean(steps_taken_per_day[[2]])
## [1] 10766.19
median(steps_taken_per_day[[2]])
## [1] 10765

3. What is the average daily activity pattern?

# One row per 5-minute interval: `Interval` is the interval id, `x` is the
# mean number of steps in that interval across all days.
steps_taken_per_five_min <- aggregate(activity.cleaned$steps, by = list(Interval = activity.cleaned$interval), FUN = "mean")

# The interval ids appear to be clock times coded as HHMM (they run
# ..., 50, 55, 100, 105, ... up to 2355), so plotting them as plain numbers
# leaves a 45-unit gap between consecutive hours. Convert them to minutes
# since midnight so the x axis is evenly spaced in time.
interval_minutes <- (steps_taken_per_five_min$Interval %/% 100) * 60 +
  steps_taken_per_five_min$Interval %% 100

#png('plot2.png')
plot(interval_minutes, steps_taken_per_five_min$x, type = "l", 
     main = "Daily Activity Pattern Average", 
     ylab = "Average Number of Steps Taken", 
     xlab = "Time of day (5-minute intervals)",
     xaxt = "n")
# Label the x axis every four hours as clock time.
axis(1, at = seq(0, 24 * 60, by = 4 * 60),
     labels = sprintf("%02d:00", seq(0, 24, by = 4)))

#dev.off()

## 5-minute interval with the highest average number of steps

# row position of the largest interval mean
interval_number_steps <- which.max(steps_taken_per_five_min[["x"]])
# interval id found at that position
steps_taken_per_five_min$Interval[interval_number_steps]
## [1] 835

4. Imputing missing values

5. Are there differences in activity patterns between weekdays and weekends?