Cleaning Data

Title: Standardized procedures to clean and organize data

Synopsis: This document describes standard procedures for cleaning and normalizing a data set.

Replacing missing values

# Example data: two numeric columns that each end in two missing values.
x <- c(0, 1, 2, 3, 4, 5, 6, NA, NA)
w <- c(0, 1, 2, 3, 4, 5, 6, NA, NA)
# Build the data frame straight from the vectors (columns `w`, then `x`).
# This avoids an intermediate matrix called `t`, which would mask base::t()
# for the rest of the session.
y <- data.frame(w, x)


# Replace the missing values of column x with the mean of its observed values.
# The column itself is indexed (instead of assigning through a row subset of
# the data frame), so the statement also works when nothing is missing.
# Only column x is imputed here; column w keeps its NA entries.
y$x[is.na(y$x)] <- mean(y$x, na.rm = TRUE)

y$x

## [1] 0 1 2 3 4 5 6 3 3

Replacing outliers

#' Cap extreme values at a lower and an upper quantile
#'
#' Values below the lower quantile of `x` are replaced by that quantile and
#' values above the upper quantile are replaced by the upper quantile
#' (winsorizing). Missing values are left in place.
#'
#' @param x A numeric vector.
#' @param probs Numeric vector of length two: the lower and upper
#'   probabilities passed to `quantile()`. Defaults to the 5th and 95th
#'   percentiles.
#' @param na.rm Passed to `quantile()`. With the default `TRUE` the bounds are
#'   computed from the non-missing values; with `FALSE`, `quantile()` stops
#'   with an error if `x` contains missing values.
#' @return A vector of the same length as `x` with out-of-range values
#'   replaced by the corresponding bound.
#' @examples
#' replace.outliers(c(1:19, 100))
#' replace.outliers(c(1:19, 100, NA), probs = c(0.10, 0.90))
replace.outliers <- function(x, probs = c(0.05, 0.95), na.rm = TRUE) {
  stopifnot(
    is.numeric(probs),
    length(probs) == 2L,
    !anyNA(probs),
    probs[1] <= probs[2]
  )

  bounds <- quantile(x, probs = probs, na.rm = na.rm, names = FALSE)

  # Exclude NA positions explicitly so missing entries are neither compared
  # nor overwritten.
  observed <- !is.na(x)
  x[observed & x < bounds[1]] <- bounds[1]
  x[observed & x > bounds[2]] <- bounds[2]
  x
}


# Sample data: three deliberately extreme values followed by 100 draws from
# a standard normal distribution.
x <- c(21, 20, 25, rnorm(100))

# 5th and 95th percentiles of the sample. The echoed values below come from
# one particular run; rnorm() is not seeded, so they change between runs.
quantiles <- quantile(x, probs = c(0.05, 0.95))
quantiles[1] # lower bound: 5th percentile

##       5% 
## -1.60599

quantiles[2] # upper bound: 95th percentile

##      95% 
## 2.173251

# Locating the outliers
# boxplot() draws the plot and also returns its summary statistics as a list;
# the `out` element holds the points that lie beyond the whiskers.
# NOTE(review): this reuses the name `y`, replacing the data frame created in
# the missing-values section.
y <- boxplot(x)
y[["out"]]

## [1] 21.000000 20.000000 25.000000 -2.221611 -3.447347 -2.229146

# Compare the distribution before and after capping, side by side.
# par() returns the previous settings; keep them so the 1 x 2 layout can be
# undone afterwards instead of leaking into every later plot.
old_par <- par(mfrow = c(1, 2))

# Left panel: the raw sample, including the injected extreme values.
boxplot(x, xlab = "Before")

# Cap the sample at its 5th and 95th percentiles (overwrites x).
x <- replace.outliers(x)

# Right panel: the capped sample.
boxplot(x, xlab = "After")

# Restore the plotting layout that was active before this comparison.
par(old_par)