Cleaning Data
Title: Standardized procedures to clean and organize data
Synopsis: This document describes standard procedures for cleaning and normalizing a data set.
Replacing missing values
x <- c(0, 1, 2, 3, 4, 5, 6, NA, NA)
w <- c(0, 1, 2, 3, 4, 5, 6, NA, NA)
# Build the data frame directly from the two vectors (columns w, then x)
y <- data.frame(w = w, x = x)
# Replace the missing values in x with the mean of the observed values.
# Indexing the column itself also works when there is nothing to replace,
# whereas assigning through a row subset of the data frame fails in that case.
y$x[is.na(y$x)] <- mean(y$x, na.rm = TRUE)
y$x
## [1] 0 1 2 3 4 5 6 3 3
Replacing outliers
# Load function replace.outliers
#
# Caps the extreme values of a numeric vector: values below the lower
# quantile are raised to it, values above the upper quantile are lowered to
# it, and everything in between is returned unchanged.
#
# Arguments:
#   x      numeric vector to clean.
#   probs  two probabilities giving the lower and upper cut-offs
#          (defaults to the 5th and 95th percentiles).
#   na.rm  if TRUE (default), missing values are ignored when the quantiles
#          are computed and stay NA in the result; if FALSE, quantile()
#          raises an error when x contains missing values.
#
# Returns: a vector of the same length as x with the extreme values replaced.
replace.outliers <- function(x, probs = c(0.05, 0.95), na.rm = TRUE) {
  stopifnot(is.numeric(probs), length(probs) == 2L)
  # names = FALSE keeps the "5%"/"95%" labels out of the replacement values
  limits <- quantile(x, probs = sort(probs), na.rm = na.rm, names = FALSE)
  # which() drops NA comparisons, so missing values are never used as indices
  x[which(x < limits[1L])] <- limits[1L]
  x[which(x > limits[2L])] <- limits[2L]
  x
}
# Create a dataset
# NOTE: no seed is set, so the values shown in the "##" output lines below
# come from one particular run and will differ each time this is executed.
x <- rnorm(100) # Create a data set
x <- c(21, 20, 25, x) # Introduce some outliers
# Calculates the quantiles for the 5th and 95th percentile values
quantiles <- quantile(x, c(.05, .95))
quantiles[1] # this is the 5th percentile
## 5%
## -1.60599
quantiles[2] # this is the 95th percentile
## 95%
## 2.173251
# Finding the outliers
# boxplot() draws the plot and returns its statistics; the `out` component
# holds the points that lie beyond the whiskers.
y <- boxplot(x)
y$out
## [1] 21.000000 20.000000 25.000000 -2.221611 -3.447347 -2.229146
# Show the next two plots side by side, remembering the previous settings
old_par <- par(mfrow = c(1, 2))
# Now you can run the function
# Before the replacement
boxplot(x, xlab = "Before") # Before
x <- replace.outliers(x)
# After the replacement
boxplot(x, xlab = "After")
# Restore the plotting layout that was active before the comparison
par(old_par)
