Cleaning Data

Title: Standardized procedures to clean and organize data

Synopsis: This document describes standard procedures for cleaning and normalizing a data set.

Summary

# Replacing missing values
# Replacing outliers
# Removing an entire column or row
# Changing data types

Replacing missing values

# Sample data: two identical numeric columns whose last two entries are missing.
x <- c(0, 1, 2, 3, 4, 5, 6, NA, NA)
w <- c(0, 1, 2, 3, 4, 5, 6, NA, NA)
t <- cbind(w, x)
y <- data.frame(t)


# Replacing missing values with the column mean

# Index the column itself rather than a row subset of the data frame:
# `y[is.na(y$x), ]$x <- value` stops with an error when no value is missing
# (zero rows selected), whereas this form is then simply a no-op.
y$x[is.na(y$x)] <- mean(y$x, na.rm = TRUE)

y$x

## [1] 0 1 2 3 4 5 6 3 3

Replacing outliers

# Load function replace.outliers
# Clamp (winsorize) the extreme values of a vector: values below the lower
# quantile are raised to it and values above the upper quantile are lowered
# to it.
#
# Arguments:
#   x     Numeric vector. Missing values are ignored when computing the
#         quantiles and are returned unchanged.
#   probs Two probabilities (lower, upper) that define the cut-offs.
#         Defaults to the 5th and 95th percentiles.
#
# Returns:
#   A vector of the same length as `x` with the extreme values replaced.
replace.outliers <- function(x, probs = c(0.05, 0.95)) {
  stopifnot(is.numeric(probs), length(probs) == 2L)
  # na.rm = TRUE: quantile() stops with an error on input containing NA.
  limits <- quantile(x, probs = probs, na.rm = TRUE, names = FALSE)
  # which() discards NA comparisons, so missing values never act as an index.
  x[which(x < limits[1])] <- limits[1]
  x[which(x > limits[2])] <- limits[2]
  x
}


# Build a sample data set: 100 standard-normal draws preceded by three
# deliberately extreme values that act as outliers.
x <- c(21, 20, 25, rnorm(100))
# 5th and 95th percentile values of the sample
quantiles <- quantile(x, probs = c(0.05, 0.95))
quantiles[1] # the 5th percentile

##        5% 
## -1.567132

quantiles[2] # the 95th percentile

##      95% 
## 1.507845

# Finding the outliers: boxplot() draws the plot and returns its statistics;
# the `out` component of that result lists the points flagged as outliers.

y <- boxplot(x)

y[["out"]]

## [1] 21.000000 20.000000 25.000000 -2.618876 -3.118897

# Compare the distribution before and after replacing the outliers.
# Both boxplots are drawn side by side. par() returns the previous settings,
# which are restored at the end so the two-panel layout does not leak into
# later plots.
old_par <- par(mfrow = c(1, 2))
# Before the replacement
boxplot(x, xlab = "Before")
# Now run the function
x <- replace.outliers(x)
# After the replacement
boxplot(x, xlab = "After")
par(old_par)

Removing an entire column or row

# Sample data frame: three identical numeric columns (w, x, b) whose last two
# entries are missing, plus a column `a` that is missing throughout.
w <- c(0, 1, 2, 3, 4, 5, 6, NA, NA)
x <- w
b <- w
a <- rep(NA, 9)
t <- cbind(w, x, a, b)
y <- data.frame(t)
print(y)

##    w  x  a  b
## 1  0  0 NA  0
## 2  1  1 NA  1
## 3  2  2 NA  2
## 4  3  3 NA  3
## 5  4  4 NA  4
## 6  5  5 NA  5
## 7  6  6 NA  6
## 8 NA NA NA NA
## 9 NA NA NA NA

# Remove a column (the third one, `a`).
# drop = FALSE keeps the result a data frame even when a single column is
# left; without it the result would silently collapse to a plain vector.

y <- y[, -3, drop = FALSE]

# Remove rows (lines) 8 and 9

y <- y[-(8:9), , drop = FALSE]

y

##   w x b
## 1 0 0 0
## 2 1 1 1
## 3 2 2 2
## 4 3 3 3
## 5 4 4 4
## 6 5 5 5
## 7 6 6 6

Changing data types

# Work on a copy of the built-in mtcars data set and inspect the type of `wt`
testando <- mtcars
str(testando[["wt"]])

##  num [1:32] 2.62 2.88 2.32 3.21 3.44 ...

# Convert the numeric column to character and inspect it again
testando[["wt"]] <- as.character(testando[["wt"]])
str(testando[["wt"]])

##  chr [1:32] "2.62" "2.875" "2.32" "3.215" "3.44" "3.46" "3.57" "3.19" ...