Cleaning Data
Title: Standardized procedures to clean and organize data
Synopsis: This document describes standard procedures for cleaning and
normalizing a data set.
Summary
# Replacing missing values
# Replacing outliers
# Removing an entire column or row
# Changing data types
Replacing missing values
# Example data: two numeric columns that each end in two missing values
x <- c(0, 1, 2, 3, 4, 5, 6, NA, NA)
w <- c(0, 1, 2, 3, 4, 5, 6, NA, NA)
# Build the data frame directly from the vectors (no intermediate matrix,
# which would otherwise be stored under a name that masks base::t)
y <- data.frame(w = w, x = x)
# Replace the missing values of column x with the mean of its observed values.
# The column itself is indexed, so the assignment is also valid when the
# column contains no NA at all (nothing is replaced in that case).
y$x[is.na(y$x)] <- mean(y$x, na.rm = TRUE)
y$x
## [1] 0 1 2 3 4 5 6 3 3
Replacing outliers
# Load function replace.outliers
#
# Caps the extreme values of a numeric vector at two of its own quantiles:
# every value below the lower quantile is set to the lower quantile and every
# value above the upper quantile is set to the upper quantile.
#
# Arguments:
#   x      vector to clean.
#   probs  two probabilities (lower, upper) giving the cut-off quantiles;
#          defaults to the 5th and 95th percentiles.
#   na.rm  if TRUE (default) missing values are ignored when the quantiles
#          are computed; they are left untouched in the result.
#
# Returns a vector of the same length as x with the out-of-range values
# replaced by the corresponding cut-off.
replace.outliers <- function(x, probs = c(.05, .95), na.rm = TRUE) {
  stopifnot(is.numeric(probs), length(probs) == 2L, probs[1L] <= probs[2L])
  # names = FALSE: only the cut-off values are needed, not the "5%" labels
  bounds <- quantile(x, probs = probs, na.rm = na.rm, names = FALSE)
  # which() drops NA comparisons, so missing values are never used as indices
  x[which(x < bounds[1L])] <- bounds[1L]
  x[which(x > bounds[2L])] <- bounds[2L]
  x
}
# Build the sample: three extreme values followed by 100 standard-normal draws
# (no seed is set, so the numbers printed below change from run to run)
x <- c(21, 20, 25, rnorm(100))
# 5th and 95th percentiles of the sample
quantiles <- quantile(x, probs = c(0.05, 0.95))
quantiles["5%"] # lower cut-off (5th percentile)
## 5%
## -1.567132
quantiles["95%"] # upper cut-off (95th percentile)
## 95%
## 1.507845
# Draw a boxplot and keep its statistics; the points it flags as outliers
# are stored in the `out` component
y <- boxplot(x)

y$out
## [1] 21.000000 20.000000 25.000000 -2.618876 -3.118897
# Show the two boxplots side by side; par() returns the previous settings,
# which are kept so the layout can be restored afterwards
old_par <- par(mfrow = c(1, 2))
# Distribution before the replacement
boxplot(x, xlab = "Before")
# Now you can run the function
x <- replace.outliers(x)
# Distribution after the replacement
boxplot(x, xlab = "After")
# Restore the previous layout so later plots are not drawn in two panels
par(old_par)

Removing an entire column or row
# Example data: three identical numeric columns ending in two NAs, plus one
# column (a) that holds nothing but NA
x <- c(0, 1, 2, 3, 4, 5, 6, NA, NA)
w <- x
b <- x
a <- rep(NA, 9)
t <- cbind(w, x, a, b)
y <- data.frame(t)
y
## w x a b
## 1 0 0 NA 0
## 2 1 1 NA 1
## 3 2 2 NA 2
## 4 3 3 NA 3
## 5 4 4 NA 4
## 6 5 5 NA 5
## 7 6 6 NA 6
## 8 NA NA NA NA
## 9 NA NA NA NA
# Drop the third column (a), which is entirely missing
y <- y[-3]
# Drop rows 8 and 9, which are entirely missing
y <- y[-c(8, 9), ]
y
## w x b
## 1 0 0 0
## 2 1 1 1
## 3 2 2 2
## 4 3 3 3
## 5 4 4 4
## 6 5 5 5
## 7 6 6 6
Changing data types
# Work on a copy of the built-in mtcars data and convert its numeric
# weight column (wt) to character
testando <- mtcars
str(testando[["wt"]])
## num [1:32] 2.62 2.88 2.32 3.21 3.44 ...
testando[["wt"]] <- as.character(testando[["wt"]])
str(testando[["wt"]])
## chr [1:32] "2.62" "2.875" "2.32" "3.215" "3.44" "3.46" "3.57" "3.19" ...