# A large scipen penalty makes R strongly prefer fixed over scientific
# notation when printing numbers.
options(scipen = 999)
# dplyr provides the pipe and the data-manipulation verbs used in this analysis.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the activity measurements; the analysis uses the columns
# steps, date and interval.
data <- read.csv(file = "activity.csv", header = TRUE)

# Total number of steps per day. Because of na.rm = TRUE, a day whose step
# counts are all missing sums to 0 and still counts as a day in the
# statistics computed from this table.
dailysteps <- data %>%
  group_by(date) %>%
  summarise(daily.steps = sum(steps, na.rm = TRUE))

# Distribution of the daily totals (the values are totals, not averages).
hist(
  dailysteps$daily.steps,
  breaks = 15,
  xlab = "daily steps",
  main = "Histogram of the total daily steps"
)

# Mean of the daily totals, rounded to a whole number of steps.
average <- round(mean(dailysteps$daily.steps), digits = 0)
average
## [1] 9354
The mean total number of steps taken per day is 9354
# Median of the daily totals. The result is stored under the name `median`,
# so the function is called through its namespace to keep the two apart.
median <- stats::median(dailysteps[["daily.steps"]])
median
## [1] 10395
The median total number of steps taken per day is 10395
# Mean number of steps in each interval, taken over all days; missing step
# counts are left out of the mean.
intervallsteps <- data %>%
  group_by(interval) %>%
  summarise(av.daily.steps = mean(steps, na.rm = TRUE))

# Line plot of the interval means against the interval identifier.
plot(
  x = intervallsteps$interval,
  y = intervallsteps$av.daily.steps,
  type = "l",
  xlab = "time interval",
  ylab = "average number of steps"
)

# One-row, one-column table holding the interval with the highest mean.
maxtime <- intervallsteps %>%
  slice(which.max(av.daily.steps)) %>%
  select(interval)
maxtime
## # A tibble: 1 x 1
## interval
## <int>
## 1 835
The interval at 835 has the maximum average number of steps.
We're going to fill the missing values with the mean of the corresponding interval
# Impute missing step counts with the mean of the same interval.
# Each row gets its interval mean attached (left join: every row of `data` is
# kept exactly once), and only a missing `steps` value is replaced by it.
# Recorded step counts are carried over unchanged, and no other column is
# touched.
newdata <- data %>%
  left_join(intervallsteps, by = "interval") %>%
  mutate(
    new.steps = ifelse(is.na(steps), av.daily.steps, as.numeric(steps))
  ) %>%
  select(date, interval, new.steps)

# Total number of steps per day after imputation.
newdailysteps <- newdata %>%
  group_by(date) %>%
  summarise(daily.steps = sum(new.steps, na.rm = TRUE))

# Distribution of the daily totals after imputation.
hist(
  newdailysteps$daily.steps,
  breaks = 15,
  xlab = "daily steps",
  main = "Histogram of the total daily steps after imputing the missing values"
)
We can compare the 2 histograms
# Draw the two histograms side by side. par() returns the previous settings,
# which are put back afterwards so the 1 x 2 layout does not leak into
# later plots.
old_par <- par(mfrow = c(1, 2))
hist(
  dailysteps$daily.steps,
  breaks = 15,
  xlab = "daily steps",
  main = "omitt NA"
)
hist(
  newdailysteps$daily.steps,
  breaks = 15,
  xlab = "daily steps",
  main = "Replace NA"
)
par(old_par)
As for the numerical values:
# Mean of the daily totals after imputation, rounded to whole steps.
newaverage <- newdailysteps$daily.steps %>%
  mean() %>%
  round(digits = 0)
newaverage
## [1] 10766
# Median of the daily totals after imputation.
newmedian <- stats::median(newdailysteps[["daily.steps"]])
newmedian
## [1] 10766.19
# Change in the rounded mean of the daily totals after imputation.
avdiff <- newaverage - average
# Change in the median of the daily totals after imputation. Despite the name
# `meandiff`, this is the difference of the two medians; `median` here is the
# numeric value assigned earlier, not the function.
meandiff <- newmedian - median
The new average is 10766, which differs from the previous average by 1412.
The new median is 10766.1886792, which differs from the previous median by 371.1886792.
As we can see, the mean differs quite a lot, whereas the median changes much less in this example. This is because the median is much more robust than the mean. But in general one has to be careful when replacing NA values, because it can change the data a lot.
# Label every row with the name of its weekday.
newdata <- mutate(
  newdata,
  weekday = weekdays(as.Date(date, format = "%Y-%m-%d"))
)

# Split on the numeric day of the week (0 = Sunday, ..., 6 = Saturday) instead
# of on the weekday names: weekdays() returns names in the session locale, so
# a comparison against English names selects nothing in other locales.
# Rows whose date cannot be parsed end up in neither subset.
day_number <- as.POSIXlt(as.Date(newdata$date, format = "%Y-%m-%d"))$wday
weekends <- filter(newdata, day_number %in% c(0, 6))
noweekends <- filter(newdata, day_number %in% 1:5)

# Mean imputed step count per interval, separately for weekend days and
# for Monday-to-Friday.
intervallstepsweekend <- weekends %>%
  group_by(interval) %>%
  summarise(av.daily.steps = mean(new.steps))
intervallstepsweekday <- noweekends %>%
  group_by(interval) %>%
  summarise(av.daily.steps = mean(new.steps))
Now we can make a panel plot comparing the two timeseries:
# Two stacked panels with tightened margins. par() returns the previous
# settings, which are put back once both panels are drawn.
old_par <- par(mfrow = c(2, 1), mar = c(2, 4, 1.5, 1.5))

# Upper panel: interval means on weekend days.
plot(
  intervallstepsweekend$interval,
  intervallstepsweekend$av.daily.steps,
  type = "l",
  xlab = "time interval",
  ylab = "average number of steps"
)
legend("topleft", legend = "weekend")

# Lower panel: interval means on Monday-to-Friday (the stray trailing comma
# that passed an empty argument to plot() is removed).
plot(
  intervallstepsweekday$interval,
  intervallstepsweekday$av.daily.steps,
  type = "l",
  xlab = "time interval",
  ylab = "average number of steps"
)
legend("topleft", legend = "weekday")

par(old_par)