Build the data file path from the working directory and read in the data
# Locate activity.csv under the current working directory and load it.
wd <- getwd()
# NOTE(review): current_proj_dir is not used by any statement visible here.
current_proj_dir <- "/Reproducible Research/RepData_PeerAssessment1"
# file.path() joins its parts with "/", giving the same string as before.
wd1 <- file.path(wd, "activity", "activity.csv")
activity <- read.csv(file = wd1)
Check some stats about the data
# Preview the first six rows (six is head()'s default).
head(x = activity, n = 6L)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
# Show the column types and the number of observations.
str(object = activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Factor w/ 61 levels "2012-10-01","2012-10-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
# List the column names of the data frame.
colnames(activity)
## [1] "steps" "date" "interval"
Convert dates to date objects
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# Parse the year-month-day values in the date column into Date objects,
# then re-inspect the structure to confirm the new column type.
activity[["date"]] <- ymd(activity[["date"]])
str(object = activity)
## 'data.frame': 17568 obs. of 3 variables:
## $ steps : int NA NA NA NA NA NA NA NA NA NA ...
## $ date : Date, format: "2012-10-01" "2012-10-01" ...
## $ interval: int 0 5 10 15 20 25 30 35 40 45 ...
# Preview the first six rows again after the date conversion.
head(x = activity, n = 6L)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
Calculate the total number of steps taken per day
# Sum the steps within each date. sum() is called without na.rm, so a date
# that contains missing step counts gets NA as its total.
ttl_walks_by_day <- aggregate(
  x = activity["steps"],
  by = list(Date = activity[["date"]]),
  FUN = sum
)
ttl_walks_by_day
## Date steps
## 1 2012-10-01 NA
## 2 2012-10-02 126
## 3 2012-10-03 11352
## 4 2012-10-04 12116
## 5 2012-10-05 13294
## 6 2012-10-06 15420
## 7 2012-10-07 11015
## 8 2012-10-08 NA
## 9 2012-10-09 12811
## 10 2012-10-10 9900
## 11 2012-10-11 10304
## 12 2012-10-12 17382
## 13 2012-10-13 12426
## 14 2012-10-14 15098
## 15 2012-10-15 10139
## 16 2012-10-16 15084
## 17 2012-10-17 13452
## 18 2012-10-18 10056
## 19 2012-10-19 11829
## 20 2012-10-20 10395
## 21 2012-10-21 8821
## 22 2012-10-22 13460
## 23 2012-10-23 8918
## 24 2012-10-24 8355
## 25 2012-10-25 2492
## 26 2012-10-26 6778
## 27 2012-10-27 10119
## 28 2012-10-28 11458
## 29 2012-10-29 5018
## 30 2012-10-30 9819
## 31 2012-10-31 15414
## 32 2012-11-01 NA
## 33 2012-11-02 10600
## 34 2012-11-03 10571
## 35 2012-11-04 NA
## 36 2012-11-05 10439
## 37 2012-11-06 8334
## 38 2012-11-07 12883
## 39 2012-11-08 3219
## 40 2012-11-09 NA
## 41 2012-11-10 NA
## 42 2012-11-11 12608
## 43 2012-11-12 10765
## 44 2012-11-13 7336
## 45 2012-11-14 NA
## 46 2012-11-15 41
## 47 2012-11-16 5441
## 48 2012-11-17 14339
## 49 2012-11-18 15110
## 50 2012-11-19 8841
## 51 2012-11-20 4472
## 52 2012-11-21 12787
## 53 2012-11-22 20427
## 54 2012-11-23 21194
## 55 2012-11-24 14478
## 56 2012-11-25 11834
## 57 2012-11-26 11162
## 58 2012-11-27 13646
## 59 2012-11-28 10183
## 60 2012-11-29 7047
## 61 2012-11-30 NA
Make a histogram of the total number of steps taken each day
# Histogram of the daily step totals, with the count printed above each bar.
hist(
  x = ttl_walks_by_day[["steps"]],
  breaks = 20,
  labels = TRUE,
  xlab = "Steps",
  main = "Total Steps By Day",
  col = "pink"
)
3a Calculate and report the mean of the total number of steps taken per day
# 3a: mean of the daily step totals. Dates whose total is NA (no recorded
# steps) are left out of the average.
ttl_steps_daily_mean <- mean(ttl_walks_by_day$steps, na.rm = TRUE)
ttl_steps_daily_mean
## [1] 10766.19
# Per-date mean of the 5-minute interval counts. This is NOT the mean of the
# daily totals; it is kept because the table that follows is its printed value.
daily_mean <- aggregate(
  activity["steps"],
  by = list(Date = activity$date),
  FUN = mean
)
daily_mean
## Date steps
## 1 2012-10-01 NA
## 2 2012-10-02 0.4375000
## 3 2012-10-03 39.4166667
## 4 2012-10-04 42.0694444
## 5 2012-10-05 46.1597222
## 6 2012-10-06 53.5416667
## 7 2012-10-07 38.2465278
## 8 2012-10-08 NA
## 9 2012-10-09 44.4826389
## 10 2012-10-10 34.3750000
## 11 2012-10-11 35.7777778
## 12 2012-10-12 60.3541667
## 13 2012-10-13 43.1458333
## 14 2012-10-14 52.4236111
## 15 2012-10-15 35.2048611
## 16 2012-10-16 52.3750000
## 17 2012-10-17 46.7083333
## 18 2012-10-18 34.9166667
## 19 2012-10-19 41.0729167
## 20 2012-10-20 36.0937500
## 21 2012-10-21 30.6284722
## 22 2012-10-22 46.7361111
## 23 2012-10-23 30.9652778
## 24 2012-10-24 29.0104167
## 25 2012-10-25 8.6527778
## 26 2012-10-26 23.5347222
## 27 2012-10-27 35.1354167
## 28 2012-10-28 39.7847222
## 29 2012-10-29 17.4236111
## 30 2012-10-30 34.0937500
## 31 2012-10-31 53.5208333
## 32 2012-11-01 NA
## 33 2012-11-02 36.8055556
## 34 2012-11-03 36.7048611
## 35 2012-11-04 NA
## 36 2012-11-05 36.2465278
## 37 2012-11-06 28.9375000
## 38 2012-11-07 44.7326389
## 39 2012-11-08 11.1770833
## 40 2012-11-09 NA
## 41 2012-11-10 NA
## 42 2012-11-11 43.7777778
## 43 2012-11-12 37.3784722
## 44 2012-11-13 25.4722222
## 45 2012-11-14 NA
## 46 2012-11-15 0.1423611
## 47 2012-11-16 18.8923611
## 48 2012-11-17 49.7881944
## 49 2012-11-18 52.4652778
## 50 2012-11-19 30.6979167
## 51 2012-11-20 15.5277778
## 52 2012-11-21 44.3993056
## 53 2012-11-22 70.9270833
## 54 2012-11-23 73.5902778
## 55 2012-11-24 50.2708333
## 56 2012-11-25 41.0902778
## 57 2012-11-26 38.7569444
## 58 2012-11-27 47.3819444
## 59 2012-11-28 35.3576389
## 60 2012-11-29 24.4687500
## 61 2012-11-30 NA
3b Calculate and report the median of the total number of steps taken per day
# 3b: median of the daily step totals. Dates whose total is NA (no recorded
# steps) are left out.
ttl_steps_daily_median <- median(ttl_walks_by_day$steps, na.rm = TRUE)
ttl_steps_daily_median
## [1] 10765
# Per-date median of the 5-minute interval counts. This is NOT the median of
# the daily totals; it is kept because the table that follows is its printed
# value.
daily_median <- aggregate(
  activity["steps"],
  by = list(Date = activity$date),
  FUN = median
)
daily_median
## Date steps
## 1 2012-10-01 NA
## 2 2012-10-02 0
## 3 2012-10-03 0
## 4 2012-10-04 0
## 5 2012-10-05 0
## 6 2012-10-06 0
## 7 2012-10-07 0
## 8 2012-10-08 NA
## 9 2012-10-09 0
## 10 2012-10-10 0
## 11 2012-10-11 0
## 12 2012-10-12 0
## 13 2012-10-13 0
## 14 2012-10-14 0
## 15 2012-10-15 0
## 16 2012-10-16 0
## 17 2012-10-17 0
## 18 2012-10-18 0
## 19 2012-10-19 0
## 20 2012-10-20 0
## 21 2012-10-21 0
## 22 2012-10-22 0
## 23 2012-10-23 0
## 24 2012-10-24 0
## 25 2012-10-25 0
## 26 2012-10-26 0
## 27 2012-10-27 0
## 28 2012-10-28 0
## 29 2012-10-29 0
## 30 2012-10-30 0
## 31 2012-10-31 0
## 32 2012-11-01 NA
## 33 2012-11-02 0
## 34 2012-11-03 0
## 35 2012-11-04 NA
## 36 2012-11-05 0
## 37 2012-11-06 0
## 38 2012-11-07 0
## 39 2012-11-08 0
## 40 2012-11-09 NA
## 41 2012-11-10 NA
## 42 2012-11-11 0
## 43 2012-11-12 0
## 44 2012-11-13 0
## 45 2012-11-14 NA
## 46 2012-11-15 0
## 47 2012-11-16 0
## 48 2012-11-17 0
## 49 2012-11-18 0
## 50 2012-11-19 0
## 51 2012-11-20 0
## 52 2012-11-21 0
## 53 2012-11-22 0
## 54 2012-11-23 0
## 55 2012-11-24 0
## 56 2012-11-25 0
## 57 2012-11-26 0
## 58 2012-11-27 0
## 59 2012-11-28 0
## 60 2012-11-29 0
## 61 2012-11-30 NA
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:lubridate':
##
## intersect, setdiff, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Average the step counts of each 5-minute interval across all days,
# ignoring missing observations.
activity_new <- summarise(
  group_by(activity, interval),
  AvgSteps = mean(steps, na.rm = TRUE)
)
# Line plot of the average steps (second column) against the interval
# (first column).
plot(
  activity_new,
  type = "l",
  col = "orange",
  main = "Average Daily Steps Over Each 5 Minute Interval",
  xlab = "5-minute interval",
  ylab = "averaged steps across all days"
)
library(ggplot2)
# The same time series drawn with ggplot2; the line colour follows the
# average step count.
ggplot(
  data = activity_new,
  mapping = aes(x = interval, y = AvgSteps, colour = AvgSteps)
) +
  geom_line() +
  labs(x = "5-minute interval", y = "averaged steps across all days")
Check that none of the interval averages is NA
# Number of intervals whose average is not NA.
sum(!is.na(activity_new[["AvgSteps"]]))
## [1] 288
# Interval identifier with the highest average step count.
with(activity_new, interval[which.max(AvgSteps)])
## [1] 835
1 Calculate and report the total number of missing values in the dataset (i.e. the total number of rows with NAs)
Impute missing values with the average number of steps for the same 5-minute interval
# Total number of observations with a missing step count.
sum(is.na(activity$steps))
## [1] 2304
# Fill each missing step count with the mean of the observed counts for the
# same 5-minute interval. Despite its name, AvgSteps holds the observed count
# where one exists and the interval mean only where steps is NA.
activity_imputed <- activity %>%
  group_by(interval) %>%
  mutate(AvgSteps = ifelse(is.na(steps), mean(steps, na.rm = TRUE), steps)) %>%
  # Drop the grouping so later steps are not silently computed per interval.
  ungroup()
# Check that no NAs are left after imputation.
sum(is.na(activity_imputed$AvgSteps))
## [1] 0
4a. Calculate the total number of steps taken per day. Make a histogram of the total number of steps taken each day.
# Sum the imputed step counts within each date.
activity_imputed_ttl_daily_steps <- aggregate(
  x = activity_imputed["AvgSteps"],
  by = list(Date = activity_imputed[["date"]]),
  FUN = sum
)
activity_imputed_ttl_daily_steps
## Date AvgSteps
## 1 2012-10-01 10766.19
## 2 2012-10-02 126.00
## 3 2012-10-03 11352.00
## 4 2012-10-04 12116.00
## 5 2012-10-05 13294.00
## 6 2012-10-06 15420.00
## 7 2012-10-07 11015.00
## 8 2012-10-08 10766.19
## 9 2012-10-09 12811.00
## 10 2012-10-10 9900.00
## 11 2012-10-11 10304.00
## 12 2012-10-12 17382.00
## 13 2012-10-13 12426.00
## 14 2012-10-14 15098.00
## 15 2012-10-15 10139.00
## 16 2012-10-16 15084.00
## 17 2012-10-17 13452.00
## 18 2012-10-18 10056.00
## 19 2012-10-19 11829.00
## 20 2012-10-20 10395.00
## 21 2012-10-21 8821.00
## 22 2012-10-22 13460.00
## 23 2012-10-23 8918.00
## 24 2012-10-24 8355.00
## 25 2012-10-25 2492.00
## 26 2012-10-26 6778.00
## 27 2012-10-27 10119.00
## 28 2012-10-28 11458.00
## 29 2012-10-29 5018.00
## 30 2012-10-30 9819.00
## 31 2012-10-31 15414.00
## 32 2012-11-01 10766.19
## 33 2012-11-02 10600.00
## 34 2012-11-03 10571.00
## 35 2012-11-04 10766.19
## 36 2012-11-05 10439.00
## 37 2012-11-06 8334.00
## 38 2012-11-07 12883.00
## 39 2012-11-08 3219.00
## 40 2012-11-09 10766.19
## 41 2012-11-10 10766.19
## 42 2012-11-11 12608.00
## 43 2012-11-12 10765.00
## 44 2012-11-13 7336.00
## 45 2012-11-14 10766.19
## 46 2012-11-15 41.00
## 47 2012-11-16 5441.00
## 48 2012-11-17 14339.00
## 49 2012-11-18 15110.00
## 50 2012-11-19 8841.00
## 51 2012-11-20 4472.00
## 52 2012-11-21 12787.00
## 53 2012-11-22 20427.00
## 54 2012-11-23 21194.00
## 55 2012-11-24 14478.00
## 56 2012-11-25 11834.00
## 57 2012-11-26 11162.00
## 58 2012-11-27 13646.00
## 59 2012-11-28 10183.00
## 60 2012-11-29 7047.00
## 61 2012-11-30 10766.19
# 4b: histogram of the daily step totals after imputation, with the count
# printed above each bar.
hist(
  activity_imputed_ttl_daily_steps$AvgSteps,
  breaks = 20,
  col = "green",
  main = "Total Steps By Day After Imputing Missing Values",
  xlab = "Steps",
  labels = TRUE
)
4c Calculate and report the mean and median total number of steps taken per day. 4ci Mean
# 4ci: mean of the daily step totals after imputation.
ttl_steps_daily_mean_imputed <- mean(activity_imputed_ttl_daily_steps$AvgSteps)
ttl_steps_daily_mean_imputed
## [1] 10766.19
# Per-date mean of the imputed 5-minute interval counts. This is NOT the mean
# of the daily totals; it is kept because the table that follows is its
# printed value.
daily_mean_after_imputation <- aggregate(
  activity_imputed["AvgSteps"],
  by = list(Date = activity_imputed$date),
  FUN = mean
)
daily_mean_after_imputation
## Date AvgSteps
## 1 2012-10-01 37.3825996
## 2 2012-10-02 0.4375000
## 3 2012-10-03 39.4166667
## 4 2012-10-04 42.0694444
## 5 2012-10-05 46.1597222
## 6 2012-10-06 53.5416667
## 7 2012-10-07 38.2465278
## 8 2012-10-08 37.3825996
## 9 2012-10-09 44.4826389
## 10 2012-10-10 34.3750000
## 11 2012-10-11 35.7777778
## 12 2012-10-12 60.3541667
## 13 2012-10-13 43.1458333
## 14 2012-10-14 52.4236111
## 15 2012-10-15 35.2048611
## 16 2012-10-16 52.3750000
## 17 2012-10-17 46.7083333
## 18 2012-10-18 34.9166667
## 19 2012-10-19 41.0729167
## 20 2012-10-20 36.0937500
## 21 2012-10-21 30.6284722
## 22 2012-10-22 46.7361111
## 23 2012-10-23 30.9652778
## 24 2012-10-24 29.0104167
## 25 2012-10-25 8.6527778
## 26 2012-10-26 23.5347222
## 27 2012-10-27 35.1354167
## 28 2012-10-28 39.7847222
## 29 2012-10-29 17.4236111
## 30 2012-10-30 34.0937500
## 31 2012-10-31 53.5208333
## 32 2012-11-01 37.3825996
## 33 2012-11-02 36.8055556
## 34 2012-11-03 36.7048611
## 35 2012-11-04 37.3825996
## 36 2012-11-05 36.2465278
## 37 2012-11-06 28.9375000
## 38 2012-11-07 44.7326389
## 39 2012-11-08 11.1770833
## 40 2012-11-09 37.3825996
## 41 2012-11-10 37.3825996
## 42 2012-11-11 43.7777778
## 43 2012-11-12 37.3784722
## 44 2012-11-13 25.4722222
## 45 2012-11-14 37.3825996
## 46 2012-11-15 0.1423611
## 47 2012-11-16 18.8923611
## 48 2012-11-17 49.7881944
## 49 2012-11-18 52.4652778
## 50 2012-11-19 30.6979167
## 51 2012-11-20 15.5277778
## 52 2012-11-21 44.3993056
## 53 2012-11-22 70.9270833
## 54 2012-11-23 73.5902778
## 55 2012-11-24 50.2708333
## 56 2012-11-25 41.0902778
## 57 2012-11-26 38.7569444
## 58 2012-11-27 47.3819444
## 59 2012-11-28 35.3576389
## 60 2012-11-29 24.4687500
## 61 2012-11-30 37.3825996
# 4cii: median of the daily step totals after imputation.
ttl_steps_daily_median_imputed <- median(activity_imputed_ttl_daily_steps$AvgSteps)
ttl_steps_daily_median_imputed
## [1] 10766.19
# Per-date median of the imputed 5-minute interval counts. This is NOT the
# median of the daily totals; it is kept because the table that follows is its
# printed value.
daily_median_imputation <- aggregate(
  activity_imputed["AvgSteps"],
  by = list(Date = activity_imputed$date),
  FUN = median
)
daily_median_imputation
## Date AvgSteps
## 1 2012-10-01 34.11321
## 2 2012-10-02 0.00000
## 3 2012-10-03 0.00000
## 4 2012-10-04 0.00000
## 5 2012-10-05 0.00000
## 6 2012-10-06 0.00000
## 7 2012-10-07 0.00000
## 8 2012-10-08 34.11321
## 9 2012-10-09 0.00000
## 10 2012-10-10 0.00000
## 11 2012-10-11 0.00000
## 12 2012-10-12 0.00000
## 13 2012-10-13 0.00000
## 14 2012-10-14 0.00000
## 15 2012-10-15 0.00000
## 16 2012-10-16 0.00000
## 17 2012-10-17 0.00000
## 18 2012-10-18 0.00000
## 19 2012-10-19 0.00000
## 20 2012-10-20 0.00000
## 21 2012-10-21 0.00000
## 22 2012-10-22 0.00000
## 23 2012-10-23 0.00000
## 24 2012-10-24 0.00000
## 25 2012-10-25 0.00000
## 26 2012-10-26 0.00000
## 27 2012-10-27 0.00000
## 28 2012-10-28 0.00000
## 29 2012-10-29 0.00000
## 30 2012-10-30 0.00000
## 31 2012-10-31 0.00000
## 32 2012-11-01 34.11321
## 33 2012-11-02 0.00000
## 34 2012-11-03 0.00000
## 35 2012-11-04 34.11321
## 36 2012-11-05 0.00000
## 37 2012-11-06 0.00000
## 38 2012-11-07 0.00000
## 39 2012-11-08 0.00000
## 40 2012-11-09 34.11321
## 41 2012-11-10 34.11321
## 42 2012-11-11 0.00000
## 43 2012-11-12 0.00000
## 44 2012-11-13 0.00000
## 45 2012-11-14 34.11321
## 46 2012-11-15 0.00000
## 47 2012-11-16 0.00000
## 48 2012-11-17 0.00000
## 49 2012-11-18 0.00000
## 50 2012-11-19 0.00000
## 51 2012-11-20 0.00000
## 52 2012-11-21 0.00000
## 53 2012-11-22 0.00000
## 54 2012-11-23 0.00000
## 55 2012-11-24 0.00000
## 56 2012-11-25 0.00000
## 57 2012-11-26 0.00000
## 58 2012-11-27 0.00000
## 59 2012-11-28 0.00000
## 60 2012-11-29 0.00000
## 61 2012-11-30 34.11321
ANS: Yes, they do. The missing values have been filled in, which changes the distribution of the daily step totals.
ANS: Imputation makes it clearer that, on average, the daily step total falls between about 10000 and 15000.
# Label every observation as falling on a weekday or a weekend day.
# as.POSIXlt()$wday numbers the days 0 (Sunday) to 6 (Saturday) in every
# locale, whereas weekdays() returns day names in the session's language.
activity_imputed_wkdays <- activity_imputed %>%
  ungroup() %>%
  mutate(
    wkday = as.factor(
      ifelse(as.POSIXlt(date)$wday %in% c(0L, 6L), "weekend", "weekday")
    )
  )
# Count the observations per day type. The wkday column exists only on
# activity_imputed_wkdays, so that is the data frame to tabulate.
table(activity_imputed_wkdays$wkday)
##
## weekday weekend
##   12960    4608
# Step counts against the 5-minute interval, one panel per day type.
# AvgSteps here is the per-observation (imputed) step count, so the lines
# connect individual observations rather than per-interval averages.
g <- ggplot(
  data = activity_imputed_wkdays,
  mapping = aes(x = interval, y = AvgSteps, colour = wkday)
)
g <- g + geom_line()
g + facet_grid(wkday ~ .)
ANS: Yes. On average, there are more steps above the 600-step mark during weekdays than there are during the weekend.