Reproducible Research. week 2. Course Project 1

0. Turn off scientific notation

options(scipen = 999) 

0.1. We load the libraries that we are going to use

# Packages required by this analysis.
packages <- c(
  'dplyr',     # data manipulation
  'lubridate', # date-times and time-spans
  'ggplot2',   # graphics
  'sqldf',     # run SQL statements on data frames
  'lattice',   # data visualization
  'Hmisc'      # data-analysis helpers, high-level graphing, imputation
)

# Compare against the installed package *names* only (the row names of the
# installed.packages() matrix). Matching against the whole matrix would also
# look inside the other columns (e.g. Imports, Depends) and could
# report a package as installed when it is merely mentioned by another one.
installed <- packages %in% rownames(installed.packages())

# Install whatever is still missing.
if (any(!installed)) {
  install.packages(packages[!installed])
}

# Attach every package. library() stops with an error when a package cannot
# be loaded, whereas require() only returns FALSE and lets the script fail
# later at an unrelated place.
invisible(lapply(packages, library, character.only = TRUE))

Loading and preprocessing the data.

Show any code that is needed to load and preprocess the data.

# Point the session at the project directory so that 'activity.csv' can be
# read by its bare file name.
# NOTE(review): this is a hard-coded absolute path; the script only runs on a
# machine that has exactly this directory.
setwd('F:/1. PROYECTOS DE TRABAJO/RStudio/5. Reproducible Research/RepData_PeerAssessment1/')
# Load the raw data set into the data frame used by every later step.
activity <- read.csv('activity.csv')
# Switch date/time formatting to English: the weekday/weekend step later
# compares the result of weekdays() against 'Saturday' and 'Sunday'.
# NOTE(review): the recorded output shows a Windows locale string; the locale
# name 'English' may not be accepted on other platforms - confirm.
Sys.setlocale('LC_TIME', 'English')
## [1] "English_United States.1252"

General exploratory analysis and data type.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
dim(activity)
## [1] 17568     3
as.data.frame(sort(names(activity)))
##   sort(names(activity))
## 1                  date
## 2              interval
## 3                 steps
head(activity, 10)
##    steps       date interval
## 1     NA 2012-10-01        0
## 2     NA 2012-10-01        5
## 3     NA 2012-10-01       10
## 4     NA 2012-10-01       15
## 5     NA 2012-10-01       20
## 6     NA 2012-10-01       25
## 7     NA 2012-10-01       30
## 8     NA 2012-10-01       35
## 9     NA 2012-10-01       40
## 10    NA 2012-10-01       45
str(activity)
## 'data.frame':    17568 obs. of  3 variables:
##  $ steps   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ date    : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ interval: int  0 5 10 15 20 25 30 35 40 45 ...
glimpse(activity)
## Observations: 17,568
## Variables: 3
## $ steps    <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ date     <fct> 2012-10-01, 2012-10-01, 2012-10-01, 2012-10-01, 2012-10-01...
## $ interval <int> 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 100, 105, 11...

Basic statistics

lapply(activity, summary)
## $steps
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    0.00    0.00   37.38   12.00  806.00    2304 
## 
## $date
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06 2012-10-07 
##        288        288        288        288        288        288        288 
## 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12 2012-10-13 2012-10-14 
##        288        288        288        288        288        288        288 
## 2012-10-15 2012-10-16 2012-10-17 2012-10-18 2012-10-19 2012-10-20 2012-10-21 
##        288        288        288        288        288        288        288 
## 2012-10-22 2012-10-23 2012-10-24 2012-10-25 2012-10-26 2012-10-27 2012-10-28 
##        288        288        288        288        288        288        288 
## 2012-10-29 2012-10-30 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04 
##        288        288        288        288        288        288        288 
## 2012-11-05 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11 
##        288        288        288        288        288        288        288 
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17 2012-11-18 
##        288        288        288        288        288        288        288 
## 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23 2012-11-24 2012-11-25 
##        288        288        288        288        288        288        288 
## 2012-11-26 2012-11-27 2012-11-28 2012-11-29 2012-11-30 
##        288        288        288        288        288 
## 
## $interval
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0   588.8  1177.5  1177.5  1766.2  2355.0
#png('plot1.png')
pairs(activity)

#dev.off()

Conclusions

The mean and the median of the interval variable are both 1177.5, which indicates that its distribution is symmetric.

In the variable steps 2304 values are missing.

What is mean total number of steps taken per day?

1. Calculate the total number of steps taken per day.

# Total steps for each date. sum() is called without na.rm, so any date that
# contains a missing value gets NA as its total.
StepsPerDay <- setNames(
  aggregate(activity$steps, by = list(activity$date), FUN = sum),
  c('Date', 'Steps')
)
# Show the first 15 daily totals
head(StepsPerDay, 15)
##          Date Steps
## 1  2012-10-01    NA
## 2  2012-10-02   126
## 3  2012-10-03 11352
## 4  2012-10-04 12116
## 5  2012-10-05 13294
## 6  2012-10-06 15420
## 7  2012-10-07 11015
## 8  2012-10-08    NA
## 9  2012-10-09 12811
## 10 2012-10-10  9900
## 11 2012-10-11 10304
## 12 2012-10-12 17382
## 13 2012-10-13 12426
## 14 2012-10-14 15098
## 15 2012-10-15 10139

2. Histogram of the total number of steps taken each day.

library(ggplot2)
#png('plot2.png')
# Histogram of the daily totals: 1000-step bins aligned at 0, with fixed
# axis breaks so the bars are easy to read off.
g <- ggplot(StepsPerDay, aes(Steps))
g +
  geom_histogram(boundary = 0, binwidth = 1000, col = 'blue', fill = 'red') +
  labs(
    title = 'Histogram total number of steps taken per day',
    x = 'Steps',
    y = 'Frequency'
  ) +
  theme(plot.title = element_text(face = 'bold', size = 12)) +
  scale_x_continuous(breaks = seq(0, 25000, 2500)) +
  scale_y_continuous(breaks = seq(0, 18, 2))
## Warning: Removed 8 rows containing non-finite values (stat_bin).

#dev.off()

3. Mean and median of total number of steps taken per day

mean(StepsPerDay$Steps, na.rm=TRUE)
## [1] 10766.19
median(StepsPerDay$Steps, na.rm=TRUE)
## [1] 10765

The mean has a value of 10766.19, while the median has a value of 10765.

What is the average daily activity pattern?

Make a time series plot (i.e. type = “l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all days (y-axis)

# Mean number of steps for every 5-minute interval, averaged over all days;
# rows with missing values are dropped before averaging.
StepsPerTime <- aggregate(
  steps ~ interval,
  data = activity,
  FUN = mean,
  na.action = na.omit
)
# Axis variable: interval divided by 100. The interval values jump from 55 to
# 100 (see the glimpse output), so they look like HHMM clock labels and the
# result reads as H.MM (e.g. 835 -> 8.35), not as decimal hours.
StepsPerTime[['time']] <- StepsPerTime[['interval']] / 100
# Line plot of the average daily pattern
#png('plot3.png')
h <- ggplot(StepsPerTime, aes(time, steps))
h +
  geom_line(col = 'violet') +
  labs(title = 'Average steps per time interval', x = 'Time', y = 'Steps') +
  theme(plot.title = element_text(face = 'bold', size = 15))

#dev.off()

2. Which 5-minute interval, on average across all the days in the dataset, contains the maximum number of steps?

library(dplyr)
# Convert to a tibble for dplyr. as_tibble() replaces tbl_df(), which is
# deprecated as of dplyr 1.0.0.
ST <- as_tibble(StepsPerTime)
# Keep the row(s) whose average step count equals the maximum
ST %>% select(time, steps) %>% filter(steps == max(steps))
## Warning: `...` is not empty.
## 
## We detected these problematic arguments:
## * `needs_dots`
## 
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 1 x 2
##    time steps
##   <dbl> <dbl>
## 1  8.35  206.

Imputing missing values

1. Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs)

# Convert to a tibble for dplyr. as_tibble() replaces tbl_df(), which is
# deprecated as of dplyr 1.0.0.
ACT <- as_tibble(activity)
# Count the rows whose steps value is missing
ACT %>% filter(is.na(steps)) %>% summarize(missing_values = n())
## Warning: `...` is not empty.
## 
## We detected these problematic arguments:
## * `needs_dots`
## 
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 1 x 1
##   missing_values
##            <int>
## 1           2304

2. Devise a strategy for filling in all of the missing values in the dataset. The strategy does not need to be sophisticated. For example, you could use the mean/median for that day, or the mean for that 5-minute interval, etc.

# Imputation: a missing step count is replaced by the rounded mean of its
# 5-minute interval (taken from StepsPerTime); observed values are kept.
interval_idx <- match(activity$interval, StepsPerTime$interval)
interval_mean <- round(StepsPerTime$steps[interval_idx], 0)
activity$CompleteSteps <- ifelse(
  is.na(activity$steps),
  interval_mean,
  activity$steps
)

3. Create a new dataset that is equal to the original dataset but with the missing data filled in.

# Build activityFull: same layout as the original data, but the steps column
# holds the imputed values.
activityFull <- with(
  activity,
  data.frame(steps = CompleteSteps, interval = interval, date = date)
)
# Show the first 15 rows of the new dataset
head(activityFull, 15)
##    steps interval       date
## 1      2        0 2012-10-01
## 2      0        5 2012-10-01
## 3      0       10 2012-10-01
## 4      0       15 2012-10-01
## 5      0       20 2012-10-01
## 6      2       25 2012-10-01
## 7      1       30 2012-10-01
## 8      1       35 2012-10-01
## 9      0       40 2012-10-01
## 10     1       45 2012-10-01
## 11     0       50 2012-10-01
## 12     0       55 2012-10-01
## 13     0      100 2012-10-01
## 14     1      105 2012-10-01
## 15     0      110 2012-10-01

4. Make a histogram of the total number of steps taken each day and Calculate and report the mean and median total number of steps taken per day. Do these values differ from the estimates from the first part of the assignment? What is the impact of imputing missing data on the estimates of the total daily number of steps?

# Daily totals computed from the imputed dataset
StepsPerDayFull <- setNames(
  aggregate(activityFull$steps, by = list(activityFull$date), FUN = sum),
  c('Date', 'Steps')
)
#png('plot4.png')
# Histogram of the imputed daily totals: 1500-step bins aligned at 0
g <- ggplot(StepsPerDayFull, aes(Steps))
g +
  geom_histogram(boundary = 0, binwidth = 1500, col = 'violet', fill = 'blue') +
  labs(title = 'Histogram of steps per day', x = 'Steps', y = 'Frequency') +
  theme(plot.title = element_text(face = 'bold', size = 20)) +
  scale_x_continuous(breaks = seq(0, 25000, 2500)) +
  scale_y_continuous(breaks = seq(0, 26, 2))

#dev.off()
# Mean
mean(StepsPerDayFull$Steps)
## [1] 10765.64
#Median
median(StepsPerDayFull$Steps)
## [1] 10762

The mean decreases from 10766.19 to 10765.64 and the median decreases from 10765 to 10762. Reviewing the histogram, it can be seen that the only interval that changes is the one between 10,000 and 12,500 steps, whose frequency increases from 18 to 26.

Filling the gaps left by the missing values with the mean has a disadvantage: it modifies the distribution of the variable, which becomes narrower because its variance is reduced. Its advantage is that the method is easy to apply.

Are there differences in activity patterns between weekdays and weekends?

For this part the weekdays() function may be of some help here. Use the dataset with the filled-in missing values for this part.

1. Create a new factor variable in the dataset with two levels - “weekday” and “weekend” indicating whether a given date is a weekday or weekend day.

# Parse the date column (yyyy-mm-dd) into Date objects
activityFull$RealDate <- as.Date(activityFull$date, format = '%Y-%m-%d')
# Name of the day of the week (depends on the LC_TIME locale; display only)
activityFull$weekday <- weekdays(activityFull$RealDate)
# Two-level factor 'weekday' / 'weekend', as the assignment asks for.
# The classification uses the numeric day of the week from POSIXlt
# (0 = Sunday, 6 = Saturday), so it does not depend on the locale that
# weekdays() uses for the day names.
activityFull$DayType <- factor(
  ifelse(
    as.POSIXlt(activityFull$RealDate)$wday %in% c(0, 6),
    'weekend',
    'weekday'
  ),
  levels = c('weekday', 'weekend')
)
# Show the first 10 rows
head(activityFull, n = 10)
##    steps interval       date   RealDate weekday DayType
## 1      2        0 2012-10-01 2012-10-01  Monday weekday
## 2      0        5 2012-10-01 2012-10-01  Monday weekday
## 3      0       10 2012-10-01 2012-10-01  Monday weekday
## 4      0       15 2012-10-01 2012-10-01  Monday weekday
## 5      0       20 2012-10-01 2012-10-01  Monday weekday
## 6      2       25 2012-10-01 2012-10-01  Monday weekday
## 7      1       30 2012-10-01 2012-10-01  Monday weekday
## 8      1       35 2012-10-01 2012-10-01  Monday weekday
## 9      0       40 2012-10-01 2012-10-01  Monday weekday
## 10     1       45 2012-10-01 2012-10-01  Monday weekday

2. Make a panel plot containing a time series plot (i.e. type = “l”) of the 5-minute interval (x-axis) and the average number of steps taken, averaged across all weekday days or weekend days (y-axis). See the README file in the GitHub repository to see an example of what this plot should look like using simulated data.

# Mean steps per 5-minute interval, computed separately for weekdays and
# weekend days (one row per interval and day type).
StepsPerTimeDT <- aggregate(
  steps ~ interval + DayType,
  data = activityFull,
  FUN = mean,
  na.action = na.omit
)
# Axis variable, derived from this table's own interval column. Taking it
# from StepsPerTime (one row per interval only) would rely on silent
# recycling and on both tables sharing the same row order.
StepsPerTimeDT$time <- StepsPerTimeDT$interval / 100
# Panel plot: one line plot per day type, stacked vertically
#png('plot5.png')
j <- ggplot(StepsPerTimeDT, aes(time, steps))
j +
  geom_line(col = 'darkred') +
  ggtitle('Average steps per time interval: weekdays vs. weekends') +
  xlab('Time') +
  ylab('Steps') +
  theme(plot.title = element_text(face = 'bold', size = 12)) +
  facet_grid(DayType ~ .)

#dev.off()