Use the mean-substitution method to replace all missing values in the given dataset.
knitr Options
# Set the default chunk option echo = TRUE so each chunk's source code is
# shown in the rendered report alongside its output.
knitr::opts_chunk$set(echo = TRUE)
Load Libraries
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Read Data
# Load the activity data. The first row is the header, and text columns are
# kept as character vectors rather than being converted to factors.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
dfrActivity <- read.csv(
  "/Users/charu/Desktop/R/R-BA/R-Scripts/data/activity.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Preview the first rows of the raw data.
head(dfrActivity)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
Number of NAs before imputation
# Count every NA cell across all columns of the data frame.
sum(is.na(dfrActivity))
## [1] 2304
Calculating means for intervals
# Average the observed step counts within each interval, ignoring NA values.
# The summary column is named `mean` directly inside summarise(), so the
# result has columns `interval` and `mean` without a separate colnames() fix-up.
dfrMean <- summarise(
  group_by(dfrActivity, interval),
  mean = mean(steps, na.rm = TRUE)
)
# Preview the per-interval means.
head(dfrMean)
## # A tibble: 6 × 2
## interval mean
## <int> <dbl>
## 1 0 1.7169811
## 2 5 0.3396226
## 3 10 0.1320755
## 4 15 0.1509434
## 5 20 0.0754717
## 6 25 2.0943396
Data Imputation
# Replace missing step counts with the mean of the matching interval.
#
# Args:
#   step:     Step count(s); a single value or a vector, may contain NA.
#   interval: Interval identifier(s), the same length as `step`.
#   means:    Lookup table with columns `interval` and `mean`. Defaults to the
#             `dfrMean` table built earlier in this script.
#
# Returns:
#   A vector the same length as `step` in which every NA has been replaced by
#   the mean of its interval. An interval that does not appear in `means`
#   leaves the value as NA.
impute <- function(step, interval, means = dfrMean) {
  stopifnot(length(step) == length(interval))
  # match() resolves all intervals in one vectorised lookup, so the function
  # works element-wise (e.g. via mapply()) and on whole columns alike, without
  # filtering the lookup table once per row.
  fill <- means[["mean"]][match(interval, means[["interval"]])]
  is_missing <- is.na(step)
  step[is_missing] <- fill[is_missing]
  step
}
# Run impute() over each (steps, interval) pair and overwrite the steps
# column with the filled-in values.
dfrActivity[["steps"]] <- mapply(
  FUN = impute,
  dfrActivity[["steps"]],
  dfrActivity[["interval"]]
)
# Preview the data after imputation.
head(dfrActivity)
## steps date interval
## 1 1.7169811 2012-10-01 0
## 2 0.3396226 2012-10-01 5
## 3 0.1320755 2012-10-01 10
## 4 0.1509434 2012-10-01 15
## 5 0.0754717 2012-10-01 20
## 6 2.0943396 2012-10-01 25
Number of NAs after imputation
# Count NA cells across all columns again; a result of 0 means every missing
# value was filled.
sum(is.na(dfrActivity))
## [1] 0