Problem Definition

Use the mean substitution method to replace all missing values in the given dataset.

Code & Output

knitr Options

# Set the default chunk option `echo = TRUE` for every chunk in the document.
knitr::opts_chunk$set(echo = TRUE)

Load Libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Read Data

# Load the activity data; the file has a header row and character columns
# are kept as plain strings rather than converted to factors.
# TRUE/FALSE are spelled out because T and F are ordinary variables that
# can be reassigned.
dfrActivity <- read.csv(
  "/Users/charu/Desktop/R/R-BA/R-Scripts/data/activity.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Preview the first rows.
head(dfrActivity)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25

Number of NAs before imputation

# Count the missing cells across all columns of the data frame.
dfrActivity %>%
  is.na() %>%
  sum()
## [1] 2304

Calculating means for intervals

# Mean number of steps for each interval, ignoring missing observations.
# The summary column is named directly in summarise() so the result does
# not rely on overwriting all column names by position afterwards.
dfrMean <- summarise(
  group_by(dfrActivity, interval),
  mean = mean(steps, na.rm = TRUE)
)
# Preview the first rows of the lookup table (columns: interval, mean).
head(dfrMean)
## # A tibble: 6 × 2
##   interval      mean
##      <int>     <dbl>
## 1        0 1.7169811
## 2        5 0.3396226
## 3       10 0.1320755
## 4       15 0.1509434
## 5       20 0.0754717
## 6       25 2.0943396

Data Imputation

#' Replace missing step counts with the mean of the matching interval.
#'
#' Vectorised: `step` and `interval` are parallel vectors (scalars also work).
#'
#' @param step Numeric vector of step counts, possibly containing `NA`.
#' @param interval Vector of interval identifiers, parallel to `step`.
#' @param means Lookup table with columns `interval` and `mean`; defaults to
#'   the global `dfrMean`.
#' @return `step` with every `NA` replaced by the mean of its interval. The
#'   value stays `NA` when the interval is absent from `means`.
impute <- function(step, interval, means = dfrMean) {
  # match() locates the lookup row for every observation in one pass,
  # instead of filtering the whole table once per observation.
  fill <- means$mean[match(interval, means$interval)]
  ifelse(is.na(step), fill, step)
}
# Apply the imputation to the whole column at once.
dfrActivity$steps <- impute(dfrActivity$steps, dfrActivity$interval)
# Preview the first rows after imputation.
head(dfrActivity)
##       steps       date interval
## 1 1.7169811 2012-10-01        0
## 2 0.3396226 2012-10-01        5
## 3 0.1320755 2012-10-01       10
## 4 0.1509434 2012-10-01       15
## 5 0.0754717 2012-10-01       20
## 6 2.0943396 2012-10-01       25

Number of NAs after imputation

# Count the missing cells across all columns of the data frame.
dfrActivity %>%
  is.na() %>%
  sum()
## [1] 0