knitr Global Options
# Development settings: echo and evaluate every chunk, and surface errors,
# warnings and messages in the rendered output.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE,
  error = TRUE,
  warning = TRUE,
  message = TRUE,
  cache = FALSE,
  tidy = FALSE,
  fig.path = "figures/"
)
# for production
#knitr::opts_chunk$set(echo=TRUE, eval=TRUE, error=FALSE, warning=FALSE, message=FALSE, cache=FALSE, tidy=FALSE, fig.path='figures/')
Load Libraries
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
The primary objective of this project is to assess the following skills of the participants attending the Business Analytics Using R workshop:
1. Basic R Concepts
2. Reading A File / Writing A File
3. Data Imputation
1. Read the data file and devise a strategy for imputing all of the missing values in the dataset.
2. Summarise and report the data.
To Read Data
# Read the activity data set into a data frame, keeping strings as character.
# NOTE(review): setwd() with an absolute path ties this chunk to one machine;
# it is kept here so that relative file paths keep resolving as before.
setwd("E:/R-BA/scripts")
CSVdfActivity <- read.csv(
  "activity.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)
nrow(CSVdfActivity)
## [1] 17568
head(CSVdfActivity)
## steps date interval
## 1 NA 2012-10-01 0
## 2 NA 2012-10-01 5
## 3 NA 2012-10-01 10
## 4 NA 2012-10-01 15
## 5 NA 2012-10-01 20
## 6 NA 2012-10-01 25
To calculate number of missing values
# Count the NA entries in each column. vapply() enforces one integer per
# column, so the result type does not depend on the shape of the input.
dfNA <- vapply(CSVdfActivity, function(x) sum(is.na(x)), integer(1))
# Convert the named vector to a one-column data frame (rows = column names).
dfNA <- as.data.frame(dfNA)
head(dfNA)
## dfNA
## steps 2304
## date 0
## interval 0
To Remove all records with NA in any columns
# Flag the rows that have no NA in any column.
vclComplete <- complete.cases(CSVdfActivity)
# Count the complete rows. Base R has no is.true(); a logical vector can be
# summed directly because TRUE counts as 1.
sum(vclComplete)
## [1] 15264
# Keep only the complete rows.
CSVdf1Activity <- CSVdfActivity[vclComplete, ]
nrow(CSVdf1Activity)
## [1] 15264
To calculate mean of available values
# Mean number of steps per interval, computed from the observed values only.
# Missing values are tested with is.na(); comparing against the string "NA"
# only dropped them as a side effect of character coercion.
CSVdf2Activity <- CSVdfActivity %>%
  filter(!is.na(steps)) %>%
  group_by(interval) %>%
  summarise(steps = mean(steps))
head(CSVdf2Activity)
## # A tibble: 6 × 2
## interval steps
## <int> <dbl>
## 1 0 1.7169811
## 2 5 0.3396226
## 3 10 0.1320755
## 4 15 0.1509434
## 5 20 0.0754717
## 6 25 2.0943396
To Replace the NA values with the mean calculated
# Replace missing step counts with the mean for the matching interval.
#
# step:     step count(s); NA marks a missing observation.
# interval: interval identifier(s), one per element of `step`.
#
# Returns `step` with each NA replaced by the `steps` value that
# CSVdf2Activity holds for the same interval. An interval that is absent
# from CSVdf2Activity leaves the value as NA.
#
# The lookup uses match(), so the function works element-wise on whole
# vectors as well as on the scalar pairs supplied by mapply().
replace_na <- function(step, interval) {
  ifelse(
    is.na(step),
    CSVdf2Activity$steps[match(interval, CSVdf2Activity$interval)],
    step
  )
}
# Work on a copy so the raw data frame stays untouched, then fill each
# missing step count row by row from its (steps, interval) pair.
CSVdfActivitynew <- CSVdfActivity
CSVdfActivitynew[["steps"]] <- mapply(
  replace_na,
  CSVdfActivitynew[["steps"]],
  CSVdfActivitynew[["interval"]]
)
head(CSVdfActivitynew)
## steps date interval
## 1 1.7169811 2012-10-01 0
## 2 0.3396226 2012-10-01 5
## 3 0.1320755 2012-10-01 10
## 4 0.1509434 2012-10-01 15
## 5 0.0754717 2012-10-01 20
## 6 2.0943396 2012-10-01 25
To Create a new dataset with the calculated values
# Save the imputed data set; row names are written as the first column
# (the write.csv default, spelled out here).
write.csv(
  x = CSVdfActivitynew,
  file = "Activitynew.csv",
  row.names = TRUE
)
The file contained 17568 rows, of which 2304 had NA in the steps column before the imputation process. These rows were filtered out in R using the code above, and two data frames were created.
The original CSVdfActivity data had step counts for only some of the records. Using group by, the mean number of steps for each interval was calculated from the available values. Each NA value was then replaced with the mean for its interval, and a new data frame was created in which all NA values are substituted by these means.