knitr Global Options

# Development profile: error, warning and message output are all switched on.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE,
  error = TRUE,
  warning = TRUE,
  message = TRUE,
  cache = FALSE,
  tidy = FALSE,
  fig.path = 'figures/'
)
# Production profile (disabled): identical except that error, warning and
# message are FALSE.
# knitr::opts_chunk$set(echo = TRUE, eval = TRUE, error = FALSE, warning = FALSE, message = FALSE, cache = FALSE, tidy = FALSE, fig.path = 'figures/')

Load Libraries

library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Project Activity Analysis

OBJECTIVE

The primary objective of this project is to assess the following skills of the participants attending the Business Analytics Using R workshop:
1. Basic R Concepts
2. Reading A File / Writing A File
3. Data Imputation

PROBLEM DEFINITION

1. Read the data file and devise a strategy for imputing all of the missing values in the dataset.
2. Summarise and report the data.

To Read Data

# Load the raw activity data.
# NOTE(review): setwd() with a hard-coded absolute path ties this script to one
# machine; it is kept because the relative paths used for reading and writing
# files resolve against this directory.
setwd("E:/R-BA/scripts")
# Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned.
CSVdfActivity <- read.csv("activity.csv", header = TRUE, stringsAsFactors = FALSE)
# Number of records read.
nrow(CSVdfActivity)
## [1] 17568
head(CSVdfActivity)
##   steps       date interval
## 1    NA 2012-10-01        0
## 2    NA 2012-10-01        5
## 3    NA 2012-10-01       10
## 4    NA 2012-10-01       15
## 5    NA 2012-10-01       20
## 6    NA 2012-10-01       25

To calculate number of missing values

# Count the NA entries in every column. vapply() pins the result to one integer
# per column, whereas sapply() silently changes its return type on unusual input.
dfNA <- vapply(CSVdfActivity, function(x) sum(is.na(x)), integer(1))
# One row per column of the source data, with the count in a column named dfNA.
dfNA <- as.data.frame(dfNA)
head(dfNA)
##          dfNA
## steps    2304
## date        0
## interval    0

To Remove all records with NA in any columns

# Flag the rows that have no NA in any column.
vclComplete <- complete.cases(CSVdfActivity)
# Count the complete rows. The previous call used is.true(), which is not an R
# function and aborted with "could not find function".
sum(vclComplete)
## [1] 15264
# Keep only the complete rows.
CSVdf1Activity <- CSVdfActivity[vclComplete, ]
nrow(CSVdf1Activity)
## [1] 15264

To calculate mean of available values

# Keep only rows with an observed step count. Test missingness with is.na():
# comparing against the string "NA" only worked because subset() drops rows
# whose condition evaluates to NA.
CSVdf2Activity <- subset(CSVdfActivity, !is.na(steps))
# Average the observed steps within each interval.
CSVdf2Activity <- summarise(group_by(CSVdf2Activity, interval), steps = mean(steps))
head(CSVdf2Activity)
## # A tibble: 6 × 2
##   interval     steps
##      <int>     <dbl>
## 1        0 1.7169811
## 2        5 0.3396226
## 3       10 0.1320755
## 4       15 0.1509434
## 5       20 0.0754717
## 6       25 2.0943396

To Replace the NA values with the mean calculated

# Replace missing step counts with the mean for the matching interval.
#
# Arguments:
#   step     - step count(s); NA marks a value to impute.
#   interval - interval identifier(s) paired with `step`.
# Returns `step` with every NA swapped for the `steps` value of the row in the
# global lookup table CSVdf2Activity whose `interval` matches. An interval that
# is absent from the table leaves the value as NA.
#
# The lookup uses match(), so the function accepts whole vectors as well as the
# single values passed to it by mapply().
replace_na <- function(step, interval)
{
  interval_mean <- CSVdf2Activity$steps[match(interval, CSVdf2Activity$interval)]
  ifelse(is.na(step), interval_mean, step)
}
# Build the imputed copy: every row's step count is passed through replace_na()
# together with its interval; the original data frame is left untouched.
CSVdfActivitynew <- transform(
  CSVdfActivity,
  steps = mapply(replace_na, steps, interval)
)
head(CSVdfActivitynew)
##       steps       date interval
## 1 1.7169811 2012-10-01        0
## 2 0.3396226 2012-10-01        5
## 3 0.1320755 2012-10-01       10
## 4 0.1509434 2012-10-01       15
## 5 0.0754717 2012-10-01       20
## 6 2.0943396 2012-10-01       25

To Create a new dataset with the calculated values

# Save the imputed dataset as a CSV file in the current working directory.
# NOTE(review): with the default row.names setting the row numbers are written
# as an extra first column - confirm that is wanted.
write.csv(x = CSVdfActivitynew, file = "Activitynew.csv")

SUMMARY

The file contained 17568 records, of which 2304 had NA step values before the imputation process. These were filtered in R, and two data frames were created using the code.

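The original CSVdfActivity data had step values for only some of the records. Using group_by, the mean number of steps for each interval was calculated from the available values. Then, using the replace function, all NA values were substituted with the mean for their interval, and a new data frame was created in which every NA value is replaced by that mean.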