Analysis of activity-data

Objective: To replace all the missing values in the activity dataset with the mean number of steps for the corresponding interval (mean imputation)

Problem definition: To analyse the problem file using R Markdown (Rmd), to devise a strategy to impute the missing values, and to summarize the results

working dir

# Set the working directory; the relative output path used when the corrected
# CSV is written resolves against this directory.
# NOTE(review): absolute, machine-specific path - confirm it exists before running.
setwd("D:/R-BA/R-Scripts/data")

**** Activity1.rmd

knitr Global Options

# Development settings: echo, eval, error, warning and message are all
# switched on; caching and tidying are off; figures go under figures/.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE,
  error = TRUE,
  warning = TRUE,
  message = TRUE,
  cache = FALSE,
  tidy = FALSE,
  fig.path = "figures/"
)
# Production settings (disabled): identical except that error, warning and
# message are switched off.
#knitr::opts_chunk$set(echo=TRUE, eval=TRUE, error=FALSE, warning=FALSE, message=FALSE, cache=FALSE, tidy=FALSE, fig.path='figures/')
load libraries
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

read activity data

# Read the raw activity data. TRUE/FALSE are spelled out because T and F are
# ordinary variables that can be reassigned. Character columns are kept as
# plain strings rather than being converted to factors.
dfrActivity <- read.csv(
  "D:/R-BA/R-Scripts/data/activity.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
#dfrActivity <- data.table(dfrActivity)
# Row count and first rows, as a quick look at what was loaded.
nrow(dfrActivity)
## [1] 17568
head(dfrActivity)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25

Number of NA records before data imputing process.

# Count the missing values in every column of dfrActivity. vapply() declares
# the result type up front, so this always yields an integer vector with one
# named entry per column.
dfrNAs <- vapply(dfrActivity, function(x) sum(is.na(x)), integer(1))
# Turn the named vector into a one-column data frame; the column names of
# dfrActivity become its row names.
dfrNAs <- as.data.frame(dfrNAs)
class(dfrNAs)
## [1] "data.frame"
head(dfrNAs)
##          dfrNAs
## steps      2304
## date          0
## interval      0

Compute the mean number of steps for each interval

# Keep only the rows where steps was recorded. Missing values are tested with
# is.na(): comparing against the string "NA" never yields TRUE/FALSE for a
# missing value, it yields NA.
dfrMean <- subset(dfrActivity, !is.na(steps))
# Average the recorded steps within each interval.
dfrMean <- summarise(group_by(dfrMean, interval), steps = mean(steps))
head(dfrMean)
## # A tibble: 6 × 2
##   interval     steps
##      <int>     <dbl>
## 1        0 1.7169811
## 2        5 0.3396226
## 3       10 0.1320755
## 4       15 0.1509434
## 5       20 0.0754717
## 6       25 2.0943396

Data Imputation process

# Fill missing step counts with the mean recorded for the same interval.
#
# step:     step count(s); NA marks a missing observation.
# interval: interval identifier(s), the same length as `step`.
#
# Returns `step` with each NA replaced by the `steps` value of the row of the
# global `dfrMean` whose `interval` matches. The result is NA where the
# interval has no entry in `dfrMean`. Works element-wise, so it can be called
# with scalars (e.g. through mapply) or with whole columns.
replace_na <- function(step, interval) {
  # match() looks every interval up by value, so the lookup stays aligned
  # with `step` whatever the order or length of the input.
  intervalMean <- dfrMean$steps[match(interval, dfrMean$interval)]
  ifelse(is.na(step), intervalMean, step)
}
head(dfrMean)
## # A tibble: 6 × 2
##   interval     steps
##      <int>     <dbl>
## 1        0 1.7169811
## 2        5 0.3396226
## 3       10 0.1320755
## 4       15 0.1509434
## 5       20 0.0754717
## 6       25 2.0943396

Replacing NA values

# Work on a copy so the raw data in dfrActivity stays untouched.
dfrActivityNew <- dfrActivity
# Store the imputed values in the copy: dfrActivityNew is the object that is
# checked for NAs and written to disk afterwards.
dfrActivityNew$steps <- mapply(
  replace_na,
  dfrActivity$steps,
  dfrActivity$interval
)
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(dfrActivityNew)
}

checking for NA values

# Count the missing values in every column of dfrActivityNew. vapply()
# declares the result type up front, so this always yields an integer vector
# with one named entry per column.
dfrNAs2 <- vapply(dfrActivityNew, function(x) sum(is.na(x)), integer(1))
# Turn the named vector into a one-column data frame for display.
dfrNAs2 <- as.data.frame(dfrNAs2)
class(dfrNAs2)
## [1] "data.frame"
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(dfrNAs2)
}

writing changes to file

# Save dfrActivityNew as CSV without a row-name column. FALSE is spelled out
# because F is an ordinary variable that can be reassigned. The relative path
# resolves against the current working directory.
write.csv(dfrActivityNew, "CorrectedActivityData.csv", row.names = FALSE)
# Display the written file for a visual check.
file.show("CorrectedActivityData.csv")