Assignment no.4

Sayli Gharat

April 4, 2017

Analysis of activity-data

Objective: To replace all missing values in the activity dataset with the mean for the corresponding interval, using data imputation.

Problem definition: To analyse the problem file using R Markdown, to devise a strategy to impute the missing values, and to summarize the results.

Overview: Data captured by a personal activity monitoring device is given. It has three columns, namely steps, date and interval. Data is captured at intervals of 5 minutes. However, there are some NA values in the ‘steps’ column. So, the main aim of the project is to replace each NA with the mean number of steps (rounded up) for that interval.

working dir

# NOTE(review): hard-coded, machine-specific absolute path. The relative paths
# used later (e.g. "./data/activity1.csv") presumably resolve against it;
# confirm before running this on another machine.
setwd("D:/R-BA/R-Scripts")

activity1.rmd

knitr Global Options

# Development settings: echo the code and surface errors, warnings and
# messages in the rendered document; figures are written under figures/.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE,
  error = TRUE,
  warning = TRUE,
  message = TRUE,
  cache = FALSE,
  tidy = FALSE,
  fig.path = 'figures/'
)
# for production
#knitr::opts_chunk$set(echo=TRUE, eval=TRUE, error=FALSE, warning=FALSE, message=FALSE, cache=FALSE, tidy=FALSE, fig.path='figures/')

Load Libraries

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Reading the file activity1.csv

# Load the raw activity data, keeping text columns as plain character vectors,
# then report how many records were read.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
dfrActivity <- read.csv(
  "./data/activity1.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
intRowCount <- nrow(dfrActivity)
print(intRowCount)
## [1] 17568
head(dfrActivity,20)
##    steps       date interval
## 1     NA 2012-10-01        0
## 2     NA 2012-10-01        5
## 3     NA 2012-10-01       10
## 4     NA 2012-10-01       15
## 5     NA 2012-10-01       20
## 6     NA 2012-10-01       25
## 7     NA 2012-10-01       30
## 8     NA 2012-10-01       35
## 9     NA 2012-10-01       40
## 10    NA 2012-10-01       45
## 11    NA 2012-10-01       50
## 12    NA 2012-10-01       55
## 13    NA 2012-10-01      100
## 14    NA 2012-10-01      105
## 15    NA 2012-10-01      110
## 16    NA 2012-10-01      115
## 17    NA 2012-10-01      120
## 18    NA 2012-10-01      125
## 19    NA 2012-10-01      130
## 20    NA 2012-10-01      135

finding out the number of NA values in steps column before imputation process

# Keep only the records whose step count is missing, then count them.
NADataFrame <- dfrActivity[is.na(dfrActivity[["steps"]]), ]
NADataFrame %>%
  group_by(steps) %>%
  summarise(n())
## # A tibble: 1 × 2
##   steps `n()`
##   <int> <int>
## 1    NA  2304

Finding the mean number of steps for each interval by grouping on interval

# Preview the first rows of the raw data before aggregating.
head(dfrActivity)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25
# One row per interval: the mean step count over the non-missing records,
# rounded up to a whole number and stored in the StepsMean column.
dfrStepsMean <- dfrActivity %>%
  group_by(interval) %>%
  summarise(StepsMean = ceiling(mean(steps, na.rm = TRUE)))
head(dfrStepsMean)
## # A tibble: 6 × 2
##   interval StepsMean
##      <int>     <dbl>
## 1        0         2
## 2        5         1
## 3       10         1
## 4       15         1
## 5       20         1
## 6       25         3

Replacing NA values in steps with the character value "NA"

# Mark every missing step count with the literal text "NA". Assigning text
# into the column turns the whole column into character (see the class()
# output below); the later replacement step compares against this marker.
NAValues <- which(is.na(dfrActivity[["steps"]]))
dfrActivity[["steps"]][NAValues] <- "NA"
head(dfrActivity)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25
class(dfrActivity$steps)
## [1] "character"

creating a join

# Attach the per-interval mean (StepsMean) to every activity record.
# The key is named explicitly so the join cannot silently change if the two
# data frames ever share another column name.
dfrInnerJoin <- inner_join(dfrActivity, dfrStepsMean, by = "interval")
View(dfrInnerJoin)

using UDF and mapply function

# Choose the step value to keep for a record.
#
# psteps:     recorded step count; either a real missing value (NA) or the
#             text marker "NA" means "missing".
# pStepsMean: replacement value to use when the step count is missing.
#
# Returns pStepsMean where psteps is missing, otherwise psteps unchanged.
getValue <- function(psteps, pStepsMean) {
  # is.na() is tested as well because `NA == "NA"` evaluates to NA, which
  # would make ifelse() return NA instead of the replacement value.
  ifelse(is.na(psteps) | psteps == "NA", pStepsMean, psteps)
}
# Apply getValue() record by record: it receives each step value together
# with the interval mean from the same row.
dfrInnerJoin[["steps"]] <- mapply(
  getValue,
  dfrInnerJoin[["steps"]],
  dfrInnerJoin[["StepsMean"]]
)
class(dfrInnerJoin[["steps"]])
## [1] "character"
# The result is still text, so convert the column back to numbers.
dfrInnerJoin[["steps"]] <- as.numeric(dfrInnerJoin[["steps"]])
head(dfrInnerJoin)
##   steps       date interval StepsMean
## 1     2 2012-10-01        0         2
## 2     1 2012-10-01        5         1
## 3     1 2012-10-01       10         1
## 4     1 2012-10-01       15         1
## 5     1 2012-10-01       20         1
## 6     3 2012-10-01       25         3
# Drop the helper column; what remains is the imputed data set.
dfrImputedData <- dfrInnerJoin %>%
  select(-StepsMean)
head(dfrImputedData)
##   steps       date interval
## 1     2 2012-10-01        0
## 2     1 2012-10-01        5
## 3     1 2012-10-01       10
## 4     1 2012-10-01       15
## 5     1 2012-10-01       20
## 6     3 2012-10-01       25

checking for NA values

# Check the imputed result itself. dfrActivity holds the text marker "NA" at
# this point, which is.na() never matches, so checking it would always report
# zero missing values regardless of whether the imputation worked.
NADataFrame <- dfrImputedData[is.na(dfrImputedData$steps),]
summarise(group_by(NADataFrame,steps),n())
## # A tibble: 0 × 2
## # ... with 2 variables: steps <dbl>, n() <int>

writing changes to file

# Save the imputed data frame built earlier in this script (dfrImputedData).
# The name used here before, dfrImputedActivityData, was never defined, so
# the call failed and no file was written.
write.csv(dfrImputedData, "ModifiedActivityData.csv", row.names = FALSE)
file.show("ModifiedActivityData.csv")

Summary Report

  1. Number of NA records before data imputing process was 2304 out of 17568 total records.
  2. No records have NA values after imputing the data.