suppressMessages(library(dplyr))
suppressWarnings(library(ggplot2))
# NOTE(review): hard-coded, machine-specific working directory; the relative
# path passed to read.csv() below is resolved against it.
setwd("C:/Users/User/Desktop")
# Load the raw activity data, keeping character columns as plain strings.
df <- read.csv("activity.csv", stringsAsFactors = FALSE)
# Total steps per date. NA step counts are dropped before summing, so a date
# with no recorded values gets a total of 0.
stepsday <- df %>%
  group_by(date) %>%
  summarise(sum = sum(steps, na.rm = TRUE))
stepsday
## # A tibble: 61 x 2
## date sum
## <chr> <int>
## 1 2012-10-01 0
## 2 2012-10-02 126
## 3 2012-10-03 11352
## 4 2012-10-04 12116
## 5 2012-10-05 13294
## 6 2012-10-06 15420
## 7 2012-10-07 11015
## 8 2012-10-08 0
## 9 2012-10-09 12811
## 10 2012-10-10 9900
## # ... with 51 more rows
# Daily totals as a plain numeric vector (second column of stepsday).
stepsday.v <- as.numeric(stepsday[[2]])
# Histogram of the daily totals.
hist(stepsday.v, breaks = 30)
# Median and mean of the daily totals, formatted as strings for reporting.
median1 <- format(median(stepsday.v))
mean1 <- format(mean(stepsday.v))
mean1
## [1] "9354.23"
median1
## [1] "10395"
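The mean and median of the total number of steps taken per day are 9354.23 and 10395, respectively. Missing values are ignored when summing, so days with no recorded steps count as 0 in these figures.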
# Average number of steps for each 5-minute interval across all dates,
# ignoring missing values.
stepsavg <- df %>%
  group_by(interval) %>%
  summarise(avg = mean(steps, na.rm = TRUE))
# Line plot of the average steps against the interval identifier.
plot(x = stepsavg$interval, y = stepsavg$avg, type = "l")
# Interval identifier(s) whose average equals the maximum average. The
# comparison is exact because max() returns one of the vector's own elements.
maxint <- stepsavg$interval[stepsavg$avg == max(stepsavg$avg)]
maxint
## [1] 835
The 5-minute interval with identifier 835 contains the maximum number of steps, on average across all days.
# Count the rows whose step count is missing.
sumna <- length(which(is.na(df$steps)))
sumna
## [1] 2304
The total number of missing values in the dataset is 2304.
First, create a copy of the original dataset.
Then, replace each missing value with the mean number of steps taken in the corresponding 5-minute interval, averaged across all days in the dataset.
# Build the imputed dataset: a copy of df in which every missing step count is
# replaced by the average for its 5-minute interval (taken from stepsavg).
df2 <- df
# Distinct interval identifiers; not needed by the vectorized fill below but
# still defined in case code outside this section refers to it.
uniqueinterval.v <- unique(df2$interval)
stepsavg.df <- as.data.frame(stepsavg)
# Compute the NA mask once, then look up each missing row's interval average
# in a single pass instead of rescanning the whole data frame per interval.
missing_steps <- is.na(df2$steps)
avg_row <- match(df2$interval[missing_steps], stepsavg.df$interval)
df2$steps[missing_steps] <- stepsavg.df$avg[avg_row]
The new dataset is created.
# Show the first rows of the imputed dataset.
head(df2)
## steps date interval
## 1 1.7169811 2012-10-01 0
## 2 0.3396226 2012-10-01 5
## 3 0.1320755 2012-10-01 10
## 4 0.1509434 2012-10-01 15
## 5 0.0754717 2012-10-01 20
## 6 2.0943396 2012-10-01 25
# Total steps per date, recomputed on the imputed dataset.
stepsday2 <- df2 %>%
  group_by(date) %>%
  summarise(sum = sum(steps, na.rm = TRUE))
# Daily totals as a plain numeric vector (second column of stepsday2).
stepsday2.v <- as.numeric(as.data.frame(stepsday2)[, 2])
# Histogram of the daily totals after imputation.
hist(stepsday2.v, breaks = 30)
# Mean and median of the daily totals, formatted as strings for reporting.
mean2 <- format(mean(stepsday2.v))
median2 <- format(median(stepsday2.v))
mean2
## [1] "10766.19"
median2
## [1] "10766.19"
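The new mean and median of the total number of steps taken per day are 10766.19 and 10766.19, respectively. Both are higher than the values computed earlier (9354.23 and 10395, respectively), because missing values were previously ignored and so contributed no steps to the daily totals, whereas they are now replaced by interval averages. Hence, imputing the missing data increases the estimates of the total daily number of steps.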
# Tag every row of the imputed data with its day name (day1), a weekend flag
# (wd) and a two-level weekday/weekend factor (factorwk).
# The weekend flag uses the numeric day of week (POSIXlt wday: 0 = Sunday,
# 6 = Saturday) rather than comparing weekdays() with English day names, so it
# does not depend on the session locale.
df3 <- df2 %>%
  mutate(
    day1 = weekdays(as.Date(date)),
    wd = as.POSIXlt(as.Date(date))$wday %in% c(0L, 6L)
  ) %>%
  mutate(
    factorwk = factor(
      wd,
      levels = c(FALSE, TRUE),
      labels = c("weekday", "weekend")
    )
  )
head(df3)
## steps date interval day1 wd factorwk
## 1 1.7169811 2012-10-01 0 Monday FALSE weekday
## 2 0.3396226 2012-10-01 5 Monday FALSE weekday
## 3 0.1320755 2012-10-01 10 Monday FALSE weekday
## 4 0.1509434 2012-10-01 15 Monday FALSE weekday
## 5 0.0754717 2012-10-01 20 Monday FALSE weekday
## 6 2.0943396 2012-10-01 25 Monday FALSE weekday
# Show the last rows of the annotated dataset.
tail(df3)
## steps date interval day1 wd factorwk
## 17563 2.6037736 2012-11-30 2330 Friday FALSE weekday
## 17564 4.6981132 2012-11-30 2335 Friday FALSE weekday
## 17565 3.3018868 2012-11-30 2340 Friday FALSE weekday
## 17566 0.6415094 2012-11-30 2345 Friday FALSE weekday
## 17567 0.2264151 2012-11-30 2350 Friday FALSE weekday
## 17568 1.0754717 2012-11-30 2355 Friday FALSE weekday
# Average steps per interval, computed separately for weekdays and weekends.
# ungroup() drops the grouping that summarise() would otherwise leave on
# factorwk, so df4 is a plain table for any later use.
df4 <- df3 %>%
  group_by(factorwk, interval) %>%
  summarise(msteps = mean(steps)) %>%
  ungroup()
# One line panel per day type, stacked vertically.
ggplot(data = df4, mapping = aes(x = interval, y = msteps)) +
  geom_line() +
  facet_grid(factorwk ~ .) +
  ylab("Average number of steps taken")