# NOTE: the workspace is deliberately NOT cleared here. `rm(list = ls())`
# deletes every object of whoever source()s this file, yet it does not detach
# packages or reset options, so it never gave a truly clean state anyway.
# Restart the R session instead when a fresh environment is needed.
####################
#Methods of Data Normalization
#1. Z-score Normalization(Standardization)
#2. Robust Scaler
#3. Min-Max Normalization
#4. Mean Normalization
#5. Unit Length
#####################1. Z-score Normalization(Standardization)
# Example data: the integers 1..10 laid out column-wise in a 5 x 2 matrix,
# which data.frame() turns into columns X1 (1..5) and X2 (6..10).
df <- data.frame(matrix(1:10, ncol = 2))
df
## X1 X2
## 1 1 6
## 2 2 7
## 3 3 8
## 4 4 9
## 5 5 10
# Standardise every column: scale() subtracts the column mean (center = TRUE is
# its default) and, with scale = TRUE, divides by the column standard deviation.
# TRUE is spelled out because `T` is an ordinary variable that can be reassigned.
# The surrounding parentheses print the result of the assignment.
(centered.x <- scale(df, scale = TRUE))
## X1 X2
## [1,] -1.2649111 -1.2649111
## [2,] -0.6324555 -0.6324555
## [3,] 0.0000000 0.0000000
## [4,] 0.6324555 0.6324555
## [5,] 1.2649111 1.2649111
## attr(,"scaled:center")
## X1 X2
## 3 8
## attr(,"scaled:scale")
## X1 X2
## 1.581139 1.581139
# Quantiles of every column; sapply() simplifies the per-column results into
# one matrix with a column per variable.
sapply(X = df, FUN = quantile)
## X1 X2
## 0% 1 6
## 25% 2 7
## 50% 3 8
## 75% 4 9
## 100% 5 10
# Mean of every column, kept as a named list by lapply().
lapply(X = df, FUN = mean)
## $X1
## [1] 3
##
## $X2
## [1] 8
# The same quantiles again, this time as a list of named vectors.
lapply(X = df, FUN = quantile)
## $X1
## 0% 25% 50% 75% 100%
## 1 2 3 4 5
##
## $X2
## 0% 25% 50% 75% 100%
## 6 7 8 9 10
#####################2. Robust Scaler
# Robust scaling: centre `x` on its median and divide by its interquartile
# range (75th percentile minus 25th percentile).
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE, missing values are ignored when computing the median and
#          the quartiles (they stay NA in the result). The default FALSE keeps
#          the previous behaviour: quantile() stops with an error on NA input.
#
# Returns a vector the same length as `x`. When the interquartile range is
# zero the division produces NaN/Inf rather than an error.
robust_scalar <- function(x, na.rm = FALSE) {
  # Both quartiles in one call; names = FALSE keeps the "25%"/"75%" labels from
  # leaking onto the result.
  quartiles <- quantile(x, probs = c(0.25, 0.75), na.rm = na.rm, names = FALSE)
  (x - median(x, na.rm = na.rm)) / (quartiles[2] - quartiles[1])
}
# Robust-scale each column of df; the result is printed as a matrix.
sapply(X = df, FUN = robust_scalar)
## X1 X2
## [1,] -1.0 -1.0
## [2,] -0.5 -0.5
## [3,] 0.0 0.0
## [4,] 0.5 0.5
## [5,] 1.0 1.0
#####################3. Min-Max Normalization
# Min-max normalisation: map `x` linearly so its minimum becomes 0 and its
# maximum becomes 1.
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE, missing values are ignored when finding the minimum and
#          maximum (they stay NA in the result). The default FALSE keeps the
#          previous behaviour, where any NA makes the whole result NA.
#
# Returns a vector the same length as `x`. A constant `x` has zero range, so
# the division yields NaN.
norm_minmax <- function(x, na.rm = FALSE) {
  # range() returns c(min, max) in a single pass.
  rng <- range(x, na.rm = na.rm)
  (x - rng[1]) / (rng[2] - rng[1])
}
# Min-max normalise each column of df; the result is printed as a matrix.
sapply(X = df, FUN = norm_minmax)
## X1 X2
## [1,] 0.00 0.00
## [2,] 0.25 0.25
## [3,] 0.50 0.50
## [4,] 0.75 0.75
## [5,] 1.00 1.00
#####################4. Mean Normalization
# Mean normalisation: centre `x` on its mean and divide by its range
# (maximum minus minimum).
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE, missing values are ignored when computing the mean, minimum
#          and maximum (they stay NA in the result). The default FALSE keeps
#          the previous behaviour, where any NA makes the whole result NA.
#
# Returns a vector the same length as `x`. A constant `x` has zero range, so
# the division yields NaN.
mean_norm_minmax <- function(x, na.rm = FALSE) {
  # range() returns c(min, max) in a single pass.
  rng <- range(x, na.rm = na.rm)
  (x - mean(x, na.rm = na.rm)) / (rng[2] - rng[1])
}
# Mean-normalise each column of df; the result is printed as a matrix.
sapply(X = df, FUN = mean_norm_minmax)
## X1 X2
## [1,] -0.50 -0.50
## [2,] -0.25 -0.25
## [3,] 0.00 0.00
## [4,] 0.25 0.25
## [5,] 0.50 0.50
# Confirm the type of object sapply() hands back for these equal-length results.
class(sapply(X = df, FUN = mean_norm_minmax))
## [1] "matrix" "array"
#####################5. Unit Length
# Unit-length scaling: divide `x` by its Euclidean (L2) norm so the result has
# a sum of squares equal to 1.
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE, missing values are left out of the norm (they stay NA in
#          the result). The default FALSE keeps the previous behaviour, where
#          any NA makes the whole result NA.
#
# Returns a vector the same length as `x`. An all-zero `x` has norm 0, so the
# division yields NaN.
unit_length <- function(x, na.rm = FALSE) {
  x / sqrt(sum(x^2, na.rm = na.rm))
}
# Scale each column of df to unit length; the result is printed as a matrix.
sapply(X = df, FUN = unit_length)
## X1 X2
## [1,] 0.1348400 0.3302891
## [2,] 0.2696799 0.3853373
## [3,] 0.4045199 0.4403855
## [4,] 0.5393599 0.4954337
## [5,] 0.6741999 0.5504819
####################plot
library(ggplot2)
# Scatter plot of the raw columns X1 against X2.
ggplot() +
  geom_point(
    data = df,
    mapping = aes(x = X1, y = X2),
    color = "darkgreen"
  )

# Same scatter plot after unit-length scaling of each column. sapply() returns
# a matrix, so it is converted back to a data frame before plotting.
ggplot() +
  geom_point(
    data = as.data.frame(sapply(df, unit_length)),
    mapping = aes(x = X1, y = X2),
    color = "darkgreen"
  )

##ref https://medium.com/swlh/data-normalisation-with-r-6ef1d1947970