## Load the raw activity data into a data frame.
d <- read.csv(
  file = "C:/REPO/datasciencecoursera/5.Reproducible_Research/activity.csv"
)
## d <- read.csv("./5.Reproducible_Research/activity.csv")
## Keep only the observed (non-missing) step counts and report their mean.
dat <- na.omit(d[["steps"]])
mean(dat)
## [1] 37.3826
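The mean number of steps per recorded interval, ignoring missing values, is 37.3826. Note that this is an average over individual observations, not a total per day.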
## Histogram of the total number of steps taken on each day.
## `dat` holds one step count per row of the input, so plotting it directly
## shows the distribution of individual observations rather than the "per day"
## quantity named in the title. Sum the steps within each date first.
## A date containing any NA sums to NA and is skipped by hist().
steps_per_day <- tapply(d$steps, d$date, sum)
hist(steps_per_day,
     main = "Histogram of Steps taken per day",
     xlab = "steps")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Drop rows with missing step counts, add a Date-typed copy of the date
## column, and compute the mean and median of steps for every date.
df <- filter(d, !is.na(steps))
df[["date2"]] <- as.Date(df[["date"]])
dt <- summarise(
  group_by(df, date2),
  avg_steps = mean(steps),
  median_steps = median(steps)
)
## Show the first rows of the per-date summary.
head(dt)
## # A tibble: 6 x 3
## date2 avg_steps median_steps
## <date> <dbl> <dbl>
## 1 2012-10-02 0.438 0
## 2 2012-10-03 39.4 0
## 3 2012-10-04 42.1 0
## 4 2012-10-05 46.2 0
## 5 2012-10-06 53.5 0
## 6 2012-10-07 38.2 0
## Line plot of the per-date mean step count (missing rows excluded).
plot(
  x = dt$date2,
  y = dt$avg_steps,
  type = "l",
  main = "Average Daily Activity Steps",
  xlab = "Date",
  ylab = "Steps"
)
## Number of rows whose step count is missing.
sum(is.na(d[["steps"]]))
## [1] 2304
## use the median to replace the missing values
library(plyr);library(dplyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## Replace every NA in `x` with the median of the non-missing values of `x`.
## Returns a vector of the same length as `x`. If all values are NA the
## median is NA, so the result is still all NA.
impute.med <- function(x) {
  fill_value <- median(x, na.rm = TRUE)
  x[is.na(x)] <- fill_value
  x
}
## plyr method
## d2 <- ddply(d, ~ date, transform, steps = impute.med(steps))
## dplyr method
## Fill missing step counts with the median of all observed step counts.
## The earlier form piped through group_by(date) into a bare mutate(), but
## with plyr attached after dplyr the bare name resolves to plyr::mutate,
## which ignores grouping, so the fill value was already the overall median.
## Calling dplyr::mutate explicitly without grouping yields the same values
## and no longer depends on package attach order. A per-date median would
## also leave dates with no observations at all unfilled.
d2 <- d %>%
  dplyr::mutate(
    steps = impute.med(steps)
  )
## Distribution of individual step counts after imputation.
hist(x = d2$steps)
## plyr is not used past this point; detaching it removes its masking of the
## dplyr verbs called below.
detach(package:plyr)
## Convert to a plain data frame before summarising.
d2 <- data.frame(d2)
#d2$date <- as.Date(d2$date)
## Mean and median of steps for every date, now including imputed rows.
dt2 <- summarise(
  group_by(d2, date),
  avg_steps = mean(steps),
  median_steps = median(steps)
)
dt2
## # A tibble: 61 x 3
## date avg_steps median_steps
## <chr> <dbl> <dbl>
## 1 2012-10-01 0 0
## 2 2012-10-02 0.438 0
## 3 2012-10-03 39.4 0
## 4 2012-10-04 42.1 0
## 5 2012-10-05 46.2 0
## 6 2012-10-06 53.5 0
## 7 2012-10-07 38.2 0
## 8 2012-10-08 0 0
## 9 2012-10-09 44.5 0
## 10 2012-10-10 34.4 0
## # ... with 51 more rows
## Convert the summary's date column to Date, then draw the per-date mean
## step count as a line.
dt2[["date"]] <- as.Date(dt2[["date"]])
with(
  dt2,
  plot(
    date, avg_steps,
    type = "l",
    main = "Average Daily Activity Steps",
    xlab = "Date",
    ylab = "Steps"
  )
)
## Tag every row with its weekday name and a weekend indicator
## (1 = Saturday or Sunday, 0 = any other day), then tabulate the indicator.
d2$date <- as.Date(d2$date)
## weekdays() returns day names in the session's locale, so comparing them
## with the English strings "Saturday"/"Sunday" marks every row as a weekday
## under a non-English locale. The wday field of POSIXlt is numeric
## (0 = Sunday, ..., 6 = Saturday) and therefore locale-independent.
d2 <- d2 %>%
  mutate(
    wday = weekdays(date),
    weekends = ifelse(as.POSIXlt(date)$wday %in% c(0, 6), 1, 0)
  )
table(d2$weekends)
##
## 0 1
## 12960 4608
library(dplyr)
## Mean and median step count for weekday rows (0) versus weekend rows (1).
summarise(
  group_by(d2, weekends),
  mean_steps = mean(steps),
  median_steps = median(steps)
)
## # A tibble: 2 x 3
## weekends mean_steps median_steps
## <dbl> <dbl> <dbl>
## 1 0 30.6 0
## 2 1 37.7 0
The median step count is the same (0) for weekdays and weekends, but the means (30.6 on weekdays versus 37.7 on weekends) suggest that people generally walk more on weekends.
library(ggplot2)
## Treat the weekend indicator as a categorical variable so each level gets
## its own box, then draw box plots of steps with error-bar whiskers on top.
d2$weekends <- factor(d2$weekends)
ggplot(data = d2, mapping = aes(x = weekends, y = steps)) +
  geom_boxplot() +
  stat_boxplot(
    geom = "errorbar",
    width = 0.25
  )