# Session setup: print numbers in fixed notation and make sure every package
# used by the analysis is installed and attached.
options(scipen = 999) # large penalty against scientific notation

packages <- c(
  'dplyr',     # For data manipulation.
  'lubridate', # To work with date-times and time-spans.
  'ggplot2',   # For graphics
  'sqldf',     # configure and transparently import a database
  'lattice',   # Data visualization
  'Hmisc'      # Useful functions for data analysis, high-level graphing,
               # impute missing values and import and annotate data sets
)

# installed.packages() returns a character matrix with one row per package.
# Match against its row names (the package names) only; matching against the
# whole matrix would also compare with the text of every other column.
installed <- packages %in% rownames(installed.packages())
if (any(!installed)) {
  install.packages(packages[!installed])
}

# library() stops with an error when a package cannot be attached, whereas
# require() only returns FALSE and lets the script carry on without it.
# invisible() keeps the list returned by lapply() out of the console.
invisible(lapply(packages, library, character.only = TRUE))
# NOTE(review): machine-specific absolute path; the script only runs where
# this folder exists. Consider running it from the project directory instead.
setwd('F:/1. PROYECTOS DE TRABAJO/RStudio/5. Reproducible Research/RepData_PeerAssessment1/')
# Read the raw data. stringsAsFactors = TRUE is spelled out because the
# default changed to FALSE in R 4.0.0; with it, `date` is a factor on every R
# version, matching the recorded str() output (Factor w/ 61 levels).
# NOTE(review): pairs(activity) further down is expected to reject a character
# `date` column, so this also keeps that call working - confirm on R >= 4.0.
activity <- read.csv('activity.csv', stringsAsFactors = TRUE)
# Switch date/time formatting to English so weekdays() returns English names.
# NOTE(review): 'English' looks like a Windows locale name (the recorded result
# is "English_United States.1252"); other platforms may need a different name.
Sys.setlocale('LC_TIME', 'English')
## [1] "English_United States.1252"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# ---- Quick inspection of the raw data ----
# The lines starting with '##' are the console output recorded for each call.

# Number of rows and columns.
dim(activity)
## [1] 17568 3
# Column names in alphabetical order, shown as a one-column data frame.
as.data.frame(sort(names(activity)))
## sort(names(activity))
## 1 date
## 2 interval
## 3 steps
# First 10 rows; steps is NA for all of them.
head(activity, 10)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
## 7 NA 2012-10-01 30
## 8 NA 2012-10-01 35
## 9 NA 2012-10-01 40
## 10 NA 2012-10-01 45
# Column types: steps and interval are integers, date is a factor.
str(activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
# Same overview in the compact one-line-per-column layout.
glimpse(activity)
## Observations: 17,568
## Variables: 3
## $ steps <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ date <fct> 2012-10-01, 2012-10-01, 2012-10-01, 2012-10-01, 2012-10-01...
## $ interval <int> 0, 5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 100, 105, 11...
# Per-column summary; the recorded output reports 2304 NA's in steps and
# 288 rows for every date.
lapply(activity, summary)
## $steps
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 0.00 0.00 37.38 12.00 806.00 2304
##
## $date
## 2012-10-01 2012-10-02 2012-10-03 2012-10-04 2012-10-05 2012-10-06 2012-10-07
## 288 288 288 288 288 288 288
## 2012-10-08 2012-10-09 2012-10-10 2012-10-11 2012-10-12 2012-10-13 2012-10-14
## 288 288 288 288 288 288 288
## 2012-10-15 2012-10-16 2012-10-17 2012-10-18 2012-10-19 2012-10-20 2012-10-21
## 288 288 288 288 288 288 288
## 2012-10-22 2012-10-23 2012-10-24 2012-10-25 2012-10-26 2012-10-27 2012-10-28
## 288 288 288 288 288 288 288
## 2012-10-29 2012-10-30 2012-10-31 2012-11-01 2012-11-02 2012-11-03 2012-11-04
## 288 288 288 288 288 288 288
## 2012-11-05 2012-11-06 2012-11-07 2012-11-08 2012-11-09 2012-11-10 2012-11-11
## 288 288 288 288 288 288 288
## 2012-11-12 2012-11-13 2012-11-14 2012-11-15 2012-11-16 2012-11-17 2012-11-18
## 288 288 288 288 288 288 288
## 2012-11-19 2012-11-20 2012-11-21 2012-11-22 2012-11-23 2012-11-24 2012-11-25
## 288 288 288 288 288 288 288
## 2012-11-26 2012-11-27 2012-11-28 2012-11-29 2012-11-30
## 288 288 288 288 288
##
## $interval
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 588.8 1177.5 1177.5 1766.2 2355.0
# Scatter-plot matrix of every column against every other column.
# Uncomment the png()/dev.off() pair to write the figure to plot1.png.
#png('plot1.png')
pairs(activity)
#dev.off()
# Total number of steps for each date, with the result columns labelled
# Date and Steps. No na.rm is used, so a date with missing steps gets NA.
StepsPerDay <- setNames(
  aggregate(activity$steps, by = list(activity$date), FUN = sum),
  c('Date', 'Steps')
)
# Show the first 15 daily totals.
head(StepsPerDay, n = 15)
## Date Steps
## 1 2012-10-01 NA
## 2 2012-10-02 126
## 3 2012-10-03 11352
## 4 2012-10-04 12116
## 5 2012-10-05 13294
## 6 2012-10-06 15420
## 7 2012-10-07 11015
## 8 2012-10-08 NA
## 9 2012-10-09 12811
## 10 2012-10-10 9900
## 11 2012-10-11 10304
## 12 2012-10-12 17382
## 13 2012-10-13 12426
## 14 2012-10-14 15098
## 15 2012-10-15 10139
library(ggplot2)
# Histogram of the daily step totals, using 1000-step bins aligned at 0.
# Uncomment the png()/dev.off() pair to write the figure to plot2.png.
#png('plot2.png')
g <- ggplot(data = StepsPerDay, mapping = aes(x = Steps))
g +
  geom_histogram(binwidth = 1000, boundary = 0, colour = 'blue', fill = 'red') +
  labs(
    title = 'Histogram total number of steps taken per day',
    x = 'Steps',
    y = 'Frequency'
  ) +
  theme(plot.title = element_text(face = 'bold', size = 12)) +
  scale_x_continuous(breaks = seq(0, 25000, 2500)) +
  scale_y_continuous(breaks = seq(0, 18, 2))
## Warning: Removed 8 rows containing non-finite values (stat_bin).
#dev.off()
# Mean and median of the daily totals, skipping the dates whose total is NA.
mean(StepsPerDay$Steps, na.rm = TRUE)
## [1] 10766.19
median(StepsPerDay$Steps, na.rm = TRUE)
## [1] 10765
# Average number of steps for each interval; rows with NA are dropped first.
StepsPerTime <- aggregate(
  steps ~ interval,
  data = activity,
  FUN = mean,
  na.action = na.omit
)
# Divide the interval code by 100 so the axis reads as hour.minute, which is
# easier to follow on the plot (the peak shows up as 8.35).
StepsPerTime[['time']] <- StepsPerTime[['interval']] / 100
# Line plot of the average steps over the day.
# Uncomment the png()/dev.off() pair to write the figure to plot3.png.
#png('plot3.png')
h <- ggplot(data = StepsPerTime, mapping = aes(x = time, y = steps))
h +
  geom_line(colour = 'violet') +
  labs(title = 'Average steps per time interval', x = 'Time', y = 'Steps') +
  theme(plot.title = element_text(face = 'bold', size = 15))
#dev.off()
library(dplyr)
# Convert the interval averages to a tibble for the dplyr pipeline.
# tibble::as_tibble() replaces tbl_df(), which is deprecated as of dplyr 1.0.0.
ST <- tibble::as_tibble(StepsPerTime)
# Find the interval with the highest average number of steps.
ST %>%
  select(time, steps) %>%
  filter(steps == max(steps))
## Warning: `...` is not empty.
##
## We detected these problematic arguments:
## * `needs_dots`
##
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 1 x 2
## time steps
## <dbl> <dbl>
## 1 8.35 206.
##Imputing missing values
# Convert the raw data to a tibble for the dplyr pipeline.
# tibble::as_tibble() replaces tbl_df(), which is deprecated as of dplyr 1.0.0.
ACT <- tibble::as_tibble(activity)
# Count the rows whose step count is missing.
ACT %>%
  filter(is.na(steps)) %>%
  summarize(missing_values = n())
## Warning: `...` is not empty.
##
## We detected these problematic arguments:
## * `needs_dots`
##
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 1 x 1
## missing_values
## <int>
## 1 2304
# Fill every missing step count with the rounded average of its interval
# (looked up in StepsPerTime); observed values are kept as they are.
# The result is stored in a new column of `activity`.
activity$CompleteSteps <- with(
  activity,
  ifelse(
    is.na(steps),
    round(StepsPerTime$steps[match(interval, StepsPerTime$interval)], 0),
    steps
  )
)
# New dataset activityFull: same rows, with the imputed values as `steps`.
activityFull <- data.frame(
  steps = activity$CompleteSteps,
  interval = activity$interval,
  date = activity$date
)
# Show the first 15 rows of the new dataset.
head(activityFull, n = 15)
## steps interval date
## 1 2 0 2012-10-01
## 2 0 5 2012-10-01
## 3 0 10 2012-10-01
## 4 0 15 2012-10-01
## 5 0 20 2012-10-01
## 6 2 25 2012-10-01
## 7 1 30 2012-10-01
## 8 1 35 2012-10-01
## 9 0 40 2012-10-01
## 10 1 45 2012-10-01
## 11 0 50 2012-10-01
## 12 0 55 2012-10-01
## 13 0 100 2012-10-01
## 14 1 105 2012-10-01
## 15 0 110 2012-10-01
# Total steps per date, this time computed from the imputed dataset.
StepsPerDayFull <- setNames(
  aggregate(activityFull$steps, by = list(activityFull$date), FUN = sum),
  c('Date', 'Steps')
)
# Histogram of the imputed daily totals, using 1500-step bins aligned at 0.
# Uncomment the png()/dev.off() pair to write the figure to plot4.png.
#png('plot4.png')
g <- ggplot(data = StepsPerDayFull, mapping = aes(x = Steps))
g +
  geom_histogram(binwidth = 1500, boundary = 0, colour = 'violet', fill = 'blue') +
  labs(title = 'Histogram of steps per day', x = 'Steps', y = 'Frequency') +
  theme(plot.title = element_text(face = 'bold', size = 20)) +
  scale_x_continuous(breaks = seq(0, 25000, 2500)) +
  scale_y_continuous(breaks = seq(0, 26, 2))
#dev.off()
# Mean of the imputed daily totals.
mean(StepsPerDayFull$Steps)
## [1] 10765.64
# Median of the imputed daily totals.
median(StepsPerDayFull$Steps)
## [1] 10762
# Parse the date column into Date objects.
activityFull$RealDate <- as.Date(activityFull$date, format = '%Y-%m-%d')
# Day name for each date (the text depends on the LC_TIME locale).
activityFull$weekday <- weekdays(activityFull$RealDate)
# Label each row as weekend or weekday using the numeric day of the week
# (POSIXlt wday: 0 = Sunday, 6 = Saturday). Comparing the day names with
# 'Saturday'/'Sunday' would label every row 'weekday' under a non-English
# locale; the numeric form gives the same answer under any locale.
day_number <- as.POSIXlt(activityFull$RealDate)$wday
activityFull$DayType <- ifelse(day_number == 0 | day_number == 6, 'weekend', 'weekday')
# Show the first 10 rows.
head(activityFull, n = 10)
## steps interval date RealDate weekday DayType
## 1 2 0 2012-10-01 2012-10-01 Monday weekday
## 2 0 5 2012-10-01 2012-10-01 Monday weekday
## 3 0 10 2012-10-01 2012-10-01 Monday weekday
## 4 0 15 2012-10-01 2012-10-01 Monday weekday
## 5 0 20 2012-10-01 2012-10-01 Monday weekday
## 6 2 25 2012-10-01 2012-10-01 Monday weekday
## 7 1 30 2012-10-01 2012-10-01 Monday weekday
## 8 1 35 2012-10-01 2012-10-01 Monday weekday
## 9 0 40 2012-10-01 2012-10-01 Monday weekday
## 10 1 45 2012-10-01 2012-10-01 Monday weekday
# Average steps per interval, computed separately for weekdays and weekends.
StepsPerTimeDT <- aggregate(steps ~ interval + DayType, data = activityFull, FUN = mean, na.action = na.omit)
# Divide the interval code by 100 so the axis reads as hour.minute.
# The value is taken from this table's own interval column; using the
# shorter StepsPerTime table here only worked through silent recycling.
StepsPerTimeDT$time <- StepsPerTimeDT$interval / 100
# Line plot with one panel per day type.
# Uncomment the png()/dev.off() pair to write the figure to plot5.png.
#png('plot5.png')
j <- ggplot(StepsPerTimeDT, aes(time, steps))
j +
  geom_line(col = 'darkred') +
  ggtitle('Average steps per time interval: weekdays vs. weekends') +
  xlab('Time') +
  ylab('Steps') +
  theme(plot.title = element_text(face = 'bold', size = 12)) +
  facet_grid(DayType ~ .)
#dev.off()