## Turn off scientific notation so large numbers print in full
options(scipen = 999)
## Load the libraries that we are going to use.
## Packages are installed only when they are missing, so re-running the script
## does not re-download them every time and still works offline once they are
## present.
required_packages <- c("dplyr", "ggplot2", "sqldf")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}
# Optional packages, currently not used by this script:
#install.packages('lubridate')
#install.packages('lattice')
#install.packages('Hmisc')
library(dplyr)
#library(lubridate)
library(ggplot2)
library(sqldf)
#library(lattice)
#library(Hmisc)
## Code for reading in the dataset and/or processing the data
# NOTE(review): the working directory below is a machine-specific absolute
# path; the script only runs where this directory exists and contains
# activity.csv.
setwd('F:/1. PROYECTOS DE TRABAJO/RStudio/5. Reproducible Research/RepData_PeerAssessment1/')
activity <- read.csv("activity.csv")
## Remove NAs from the dataset and then drop any date factor levels that are
## no longer relevant because all of their associated data points were NAs.
activity.cleaned <- activity[complete.cases(activity), ]
# Re-creating the factor discards levels that no longer occur in the data.
activity.cleaned$date <- factor(activity.cleaned$date)
## Exploring the basics of this data.
## The `##` lines that follow each call appear to be the console output
## recorded from an earlier run of that call.
# Number of rows and columns in the raw data
dim(activity)
## [1] 17568 3
# Column names of the raw data
names(activity)
## [1] "steps" "date" "interval"
# Per-column summary statistics, including the count of NAs in steps
summary(activity)
## steps date interval
## Min. : 0.00 2012-10-01: 288 Min. : 0.0
## 1st Qu.: 0.00 2012-10-02: 288 1st Qu.: 588.8
## Median : 0.00 2012-10-03: 288 Median :1177.5
## Mean : 37.38 2012-10-04: 288 Mean :1177.5
## 3rd Qu.: 12.00 2012-10-05: 288 3rd Qu.:1766.2
## Max. :806.00 2012-10-06: 288 Max. :2355.0
## NA's :2304 (Other) :15840
# First four rows of the raw data
head(activity, 4)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
# Structure (column types) of the raw data
str(activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
# Same structural overview, in tibble's compact format
tibble::glimpse(activity)
## Observations: 17,568
## Variables: 3
## $ steps <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ date <fct> 2012-10-01, 2012-10-01, 2012-10-01, 2012-10-01, 2012-10-01...
## $ interval <int> 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 100, 105, 11...
# Column names of the cleaned data, sorted alphabetically
as.data.frame(sort(names(activity.cleaned)))
## sort(names(activity.cleaned))
## 1 date
## 2 interval
## 3 steps
# Proportion of rows whose steps value is missing (NA count / row count)
sum(is.na(activity$steps))/dim(activity)[[1]]
## [1] 0.1311475
# Number of distinct dates in the raw data. The date column is not converted
# to a Date type here (lubridate is not loaded in this script).
length(unique(activity$date))
## [1] 61
## Histogram of the total number of steps taken each day
# Sum the steps within each date; the totals end up in column `x`.
steps_taken_per_day <- with(
  activity.cleaned,
  aggregate(steps, by = list(Steps.Date = date), FUN = sum)
)
#png('plot1.png')
hist(
  steps_taken_per_day[["x"]],
  breaks = 50,
  col = "blue",
  main = "Total steps taken each day",
  xlab = "Number of steps taken per day"
)
# Tick marks along the axis for each individual daily total
rug(steps_taken_per_day[["x"]])
#abline(v = 10700, lwd = 4)
# Vertical reference lines: median in green (thin), then mean in red (thick)
abline(v = median(steps_taken_per_day[["x"]]), lwd = 2, col = 'green')
abline(v = mean(steps_taken_per_day[["x"]]), lwd = 8, col = 'red')
#dev.off()
## Mean and median of the total number of steps taken per day
# Column `x` holds the per-day totals.
mean(steps_taken_per_day$x)
## [1] 10766.19
median(steps_taken_per_day$x)
## [1] 10765
## Average number of steps per 5-minute interval, averaged across all days
steps_taken_per_five_min <- with(
  activity.cleaned,
  aggregate(steps, by = list(Interval = interval), FUN = mean)
)
#png('plot2.png')
# Line plot of the interval averages (column `x`) against the interval id
plot(
  x = steps_taken_per_five_min$Interval,
  y = steps_taken_per_five_min$x,
  type = "l",
  main = "Daily Activity Pattern Average",
  xlab = "Intervals 5 min",
  ylab = "Number Avarage of Steps Taken "
)
#dev.off()
## 5-minute interval with the highest average number of steps
# Row index of the largest interval average
interval_number_steps <- which.max(steps_taken_per_five_min[["x"]])
# Interval identifier stored in that row
steps_taken_per_five_min$Interval[interval_number_steps]
## [1] 835