#One-hot encoding
# Build a small reproducible example: a numeric Outcome (1..5) and a
# categorical Variable drawn at random from three colour labels.
# NOTE: the former `rm(list = ls())` was removed on purpose -- a script should
# not wipe the caller's workspace; restart R to get a clean session instead.
set.seed(2022)  # fixed seed so the sampled colours are reproducible
data <- data.frame(
  Outcome = seq(1, 5, by = 1),
  Variable = sample(c("Red", "Green", "Blue"), 5, replace = TRUE)
)
# Show the example data and confirm it is a plain data.frame.
data
class(data)
##   Outcome Variable
## 1       1     Blue
## 2       2    Green
## 3       3     Blue
## 4       4     Blue
## 5       5    Green
## [1] "data.frame"
######################################   Method 1: mltools::one_hot()
library(mltools)
library(data.table)
# Turn the categorical column into a factor before encoding.
# NOTE(review): presumably needed because one_hot() targets factor columns --
# confirm against the mltools documentation.
data[["Variable"]] <- as.factor(data[["Variable"]])
# Default call: the recorded output below shows one 0/1 indicator column per
# observed colour, with the original Variable column removed.
one_hot(dt = as.data.table(data))
##    Outcome Variable_Blue Variable_Green
## 1:       1             1              0
## 2:       2             0              1
## 3:       3             1              0
## 4:       4             1              0
## 5:       5             0              1
# Same encoding with sparsifyNAs switched on. The recorded output below matches
# the default call -- presumably because this data contains no missing values.
one_hot(
  dt = as.data.table(data),
  sparsifyNAs = TRUE
)
##    Outcome Variable_Blue Variable_Green
## 1:       1             1              0
## 2:       2             0              1
## 3:       3             1              0
## 4:       4             1              0
## 5:       5             0              1
# Same encoding with naCols switched on. The recorded output below shows no
# extra columns -- presumably because this data contains no missing values.
one_hot(
  dt = as.data.table(data),
  naCols = TRUE
)
##    Outcome Variable_Blue Variable_Green
## 1:       1             1              0
## 2:       2             0              1
## 3:       3             1              0
## 4:       4             1              0
## 5:       5             0              1
# With dropCols = FALSE the original Variable column is kept alongside the new
# indicator columns (see the recorded output below).
one_hot(
  dt = as.data.table(data),
  dropCols = FALSE
)
##    Outcome Variable Variable_Blue Variable_Green
## 1:       1     Blue             1              0
## 2:       2    Green             0              1
## 3:       3     Blue             1              0
## 4:       4     Blue             1              0
## 5:       5    Green             0              1
# Same encoding with dropUnusedLevels switched on. The recorded output below
# matches the default call -- presumably because the factor was built from the
# sampled values only, so it has no unused levels.
one_hot(
  dt = as.data.table(data),
  dropUnusedLevels = TRUE
)
##    Outcome Variable_Blue Variable_Green
## 1:       1             1              0
## 2:       2             0              1
## 3:       3             1              0
## 4:       4             1              0
## 5:       5             0              1
#####################################   Method 2: caret::dummyVars()
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Define the encoding for every column of `data`, then apply it to the same
# data. predict() output is wrapped in data.frame() before printing.
dummy <- dummyVars(formula = " ~ .", data = data)
newdata <- data.frame(
  predict(object = dummy, newdata = data)
)
newdata
##   Outcome Variable.Blue Variable.Green
## 1       1             1              0
## 2       2             0              1
## 3       3             1              0
## 4       4             1              0
## 5       5             0              1
#####################################   Method 3: reshape2::dcast()
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
# Cast to wide form: one row per Outcome, one column per colour, each cell
# holding the count (length) of matching rows, i.e. 0 or 1 here.
# dcast is namespace-qualified because data.table and reshape2 both export it
# and the unqualified name depends on attach order. value.var is given
# explicitly so reshape2 does not have to guess the value column (which
# printed a "use value.var to override" message).
newdata <- reshape2::dcast(
  data = data,
  formula = Outcome ~ Variable,
  fun.aggregate = length,
  value.var = "Variable"
)
newdata
##   Outcome Blue Green
## 1       1    1     0
## 2       2    0     1
## 3       3    1     0
## 4       4    1     0
## 5       5    0     1
#Applying one-hot encoding to multiple variables at the same time
# Second example: eight rows with two categorical columns (Colour, Quality),
# each sampled with replacement from three labels under a fixed seed.
set.seed(2022)
data <- data.frame(
  ID = seq(from = 1, to = 8, by = 1),
  # Colour is sampled before Quality; the order matters for reproducibility.
  Colour = sample(x = c("Red", "Green", "Blue"), size = 8, replace = TRUE),
  Quality = sample(x = c("Poor", "Average", "Good"), size = 8, replace = TRUE)
)
data
##   ID Colour Quality
## 1  1   Blue    Poor
## 2  2  Green Average
## 3  3   Blue    Good
## 4  4   Blue    Poor
## 5  5  Green Average
## 6  6   Blue    Good
## 7  7  Green Average
## 8  8   Blue    Poor
# Method 1: caret::dummyVars()
# Encode every column of the multi-variable data in one pass: define the
# encoding with dummyVars(), apply it with predict(), and wrap the result in
# data.frame() before printing.
dummy <- dummyVars(formula = " ~ .", data = data)
newdata <- data.frame(
  predict(object = dummy, newdata = data)
)
newdata
##   ID ColourBlue ColourGreen QualityAverage QualityGood QualityPoor
## 1  1          1           0              0           0           1
## 2  2          0           1              1           0           0
## 3  3          1           0              0           1           0
## 4  4          1           0              0           0           1
## 5  5          0           1              1           0           0
## 6  6          1           0              0           1           0
## 7  7          0           1              1           0           0
## 8  8          1           0              0           0           1
# Method 2: reshape2::melt() + reshape2::dcast()
# Melt to long form (one row per ID/variable/value), then cast back to wide
# form with one count column per variable_value combination.
# melt and dcast are namespace-qualified because data.table and reshape2 both
# export them and the unqualified names depend on attach order. value.var
# names the melted "value" column explicitly instead of leaving it guessed.
newdata <- reshape2::dcast(
  data = reshape2::melt(data, id.vars = "ID"),
  formula = ID ~ variable + value,
  fun.aggregate = length,
  value.var = "value"
)
newdata
##   ID Colour_Blue Colour_Green Quality_Average Quality_Good Quality_Poor
## 1  1           1            0               0            0            1
## 2  2           0            1               1            0            0
## 3  3           1            0               0            1            0
## 4  4           1            0               0            0            1
## 5  5           0            1               1            0            0
## 6  6           1            0               0            1            0
## 7  7           0            1               1            0            0
## 8  8           1            0               0            0            1
# Reference: https://datatricks.co.uk/one-hot-encoding-in-r-three-simple-methods