SOURCE: https://medium.com/@sjacks/feature-transformation-21282d1a3215
\[ X' = log(X) \]
\[ X' = \frac{X - X_{min}}{X_{max} - X_{min}} \]
\[ X' = \frac{X}{max(abs(X))} \]
\[ X' = \frac{X - \mu}{\sigma} \]
\[ X' = \frac{X - Q_1(X)}{Q_3(X) - Q_1(X)}\]
\[ x(\lambda) = \begin{cases} \frac{x^\lambda - 1}{\lambda}, & \text{if } \lambda \ne 0 \\ \log(x), & \text{if } \lambda = 0 \end{cases} \]
library(ggplot2)
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
library(hrbrthemes)
#> NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
#> Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
#> if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(bestNormalize)
# German credit data set exported by the scorecard package.
df <- scorecard::germancredit

# Add one column per transformation of `credit.amount`. Every new column
# shares the "amt" prefix so the plotting step can select them together.
df <- df %>%
  mutate(
    # Untransformed values, kept for comparison.
    # NOTE(review): the double underscore presumably makes this panel sort
    # first among the facets -- confirm before renaming.
    amt__original = credit.amount,
    # Log transform.
    amt_log = log(credit.amount),
    # Min-max scaling: (x - min) / (max - min).
    amt_min_max = (credit.amount - min(credit.amount)) /
      (max(credit.amount) - min(credit.amount)),
    # Max-abs scaling: x / max(|x|).
    amt_max_abs = credit.amount / max(abs(credit.amount)),
    # Standardization: (x - mean) / sd.
    amt_standard = (credit.amount - mean(credit.amount)) / sd(credit.amount),
    # Robust scaling: (x - Q1) / (Q3 - Q1).
    amt_robust = (credit.amount - quantile(credit.amount, 1 / 4)) /
      (quantile(credit.amount, 3 / 4) - quantile(credit.amount, 1 / 4)),
    # Transformation chosen by bestNormalize(); predict() on the fitted
    # object yields the transformed values.
    # NOTE(review): the selection may vary between runs unless a seed is
    # set -- verify against the bestNormalize documentation.
    amt_power = bestNormalize(credit.amount) %>% predict()
  )

# Reshape the "amt" columns to long format (columns `name` and `value`) and
# draw one histogram per transformation, each panel with its own axes.
p <- df %>%
  select(starts_with("amt")) %>%
  tidyr::pivot_longer(cols = starts_with("amt")) %>%
  ggplot(aes(x = value, fill = name, color = name)) +
  geom_histogram() +
  scale_color_ipsum() +
  scale_fill_ipsum() +
  facet_wrap(~name, scales = "free", ncol = 3) +
  labs(
    title = "Compare feature transformation",
    x = NULL,
    y = NULL
  ) +
  theme_ipsum_ps(grid = "XY", axis = "xy") +
  theme(legend.position = "none")

# Print the plot.
p
#> `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.