working dir
# NOTE(review): hard-coded absolute Windows path. The relative paths used later
# ("./data/activity1.csv", "ModifiedActivityData.csv") resolve against it, so
# this only works on a machine that has this exact directory -- confirm whether
# the call is still needed.
setwd("D:/R-BA/R-Scripts")
activity1.rmd
knitr Global Options
# Development settings: echo the code and keep errors, warnings and messages
# switched on for every chunk.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE,
  error = TRUE,
  warning = TRUE,
  message = TRUE,
  cache = FALSE,
  tidy = FALSE,
  fig.path = 'figures/'
)
# Production alternative (disabled): identical, except that error, warning and
# message are switched off.
#knitr::opts_chunk$set(echo=TRUE, eval=TRUE, error=FALSE, warning=FALSE, message=FALSE, cache=FALSE, tidy=FALSE, fig.path='figures/')
Load Libraries
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Reading the file activity1.csv
# Read the raw activity data. TRUE/FALSE are spelled out because the T/F
# shorthands are ordinary variables that can be reassigned.
dfrActivity <- read.csv(
  "./data/activity1.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Report how many rows were loaded, then preview the first 20 of them.
intRowCount <- nrow(dfrActivity)
print(intRowCount)
## [1] 17568
head(dfrActivity, 20)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
## 7 NA 2012-10-01 30
## 8 NA 2012-10-01 35
## 9 NA 2012-10-01 40
## 10 NA 2012-10-01 45
## 11 NA 2012-10-01 50
## 12 NA 2012-10-01 55
## 13 NA 2012-10-01 100
## 14 NA 2012-10-01 105
## 15 NA 2012-10-01 110
## 16 NA 2012-10-01 115
## 17 NA 2012-10-01 120
## 18 NA 2012-10-01 125
## 19 NA 2012-10-01 130
## 20 NA 2012-10-01 135
finding out the number of NA values in steps column before imputation process
# Keep only the rows whose steps value is missing, then count them.
NADataFrame <- subset(dfrActivity, is.na(steps))
NADataFrame %>%
  group_by(steps) %>%
  summarise(n())
## # A tibble: 1 × 2
## steps `n()`
## <int> <int>
## 1 NA 2304
Finding the mean number of steps for each interval using group_by
head(dfrActivity)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
# Mean steps per interval (missing values ignored), rounded up to a whole
# number and stored in a column named StepsMean.
dfrStepsMean <- dfrActivity %>%
  group_by(interval) %>%
  summarise(StepsMean = ceiling(mean(steps, na.rm = TRUE)))
head(dfrStepsMean)
## # A tibble: 6 × 2
## interval StepsMean
## <int> <dbl>
## 1 0 2
## 2 5 1
## 3 10 1
## 4 15 1
## 5 20 1
## 6 25 3
Replacing NA values in steps with char value
# Overwrite every missing steps entry with the literal string "NA".
# Side effect: the whole steps column is coerced to character.
NAValues <- which(is.na(dfrActivity$steps))
dfrActivity$steps <- replace(dfrActivity$steps, NAValues, "NA")
head(dfrActivity)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
class(dfrActivity$steps)
## [1] "character"
creating a join
# Attach each interval's mean to every observation. The key is spelled out so
# the join cannot silently pick up any other column name the two frames share.
dfrInnerJoin <- inner_join(dfrActivity, dfrStepsMean, by = "interval")
# View() opens a data viewer, which is only useful in an interactive session;
# skip it when the document is rendered non-interactively.
if (interactive()) {
  View(dfrInnerJoin)
}
Filling in the missing steps values using a user-defined function (UDF)
#' Fill in a missing steps value with a fallback
#'
#' Vectorised: both arguments may be vectors and are combined element-wise
#' by `ifelse()`.
#'
#' @param psteps Steps value(s). An element counts as missing when it is the
#'   string "NA" or a genuine `NA`.
#' @param pStepsMean Fallback value(s) used where `psteps` is missing.
#' @return `pStepsMean` where `psteps` is missing, otherwise `psteps`. When
#'   character and numeric elements are mixed the result is character.
getValue <- function(psteps, pStepsMean) {
  # `NA == "NA"` evaluates to NA, not TRUE, so a genuine NA must be tested
  # with is.na(); otherwise it would pass through without being filled in.
  isMissing <- is.na(psteps) | psteps == "NA"
  ifelse(isMissing, pStepsMean, psteps)
}
# getValue() is built on ifelse() and therefore already works on whole vectors,
# so it is applied to the two columns directly instead of element by element
# through mapply() (which also attached the steps strings as element names).
dfrInnerJoin$steps <- getValue(dfrInnerJoin$steps, dfrInnerJoin$StepsMean)
class(dfrInnerJoin$steps)
## [1] "character"
# The filled column is still character at this point; convert it to numeric.
dfrInnerJoin$steps <- as.numeric(dfrInnerJoin$steps)
head(dfrInnerJoin)
## steps date interval StepsMean
## 1 2 2012-10-01 0 2
## 2 1 2012-10-01 5 1
## 3 1 2012-10-01 10 1
## 4 1 2012-10-01 15 1
## 5 1 2012-10-01 20 1
## 6 3 2012-10-01 25 3
# The helper column of interval means has served its purpose; drop it.
dfrImputedData <- dfrInnerJoin %>%
  select(-StepsMean)
head(dfrImputedData)
## steps date interval
## 1 2 2012-10-01 0
## 2 1 2012-10-01 5
## 3 1 2012-10-01 10
## 4 1 2012-10-01 15
## 5 1 2012-10-01 20
## 6 3 2012-10-01 25
checking for NA values
# Check the imputed result itself. dfrActivity had its missing values
# overwritten with the string "NA" earlier, so testing that frame with is.na()
# could never find anything and the check was vacuous.
NADataFrame <- dfrImputedData[is.na(dfrImputedData$steps), ]
summarise(group_by(NADataFrame, steps), n())
## # A tibble: 0 × 2
## # ... with 2 variables: steps <dbl>, n() <int>
writing changes to file
# Save the imputed data. The frame is named dfrImputedData; the previous call
# referenced a non-existent object (dfrImputedActivityData), so nothing was
# written. row.names = FALSE keeps the row index out of the file.
write.csv(dfrImputedData, "ModifiedActivityData.csv", row.names = FALSE)
file.show("ModifiedActivityData.csv")