# One-hot encoding
# The workspace is intentionally NOT cleared (no rm(list = ls())): running this
# script must never delete objects that already exist in the caller's session.

# Fix the RNG seed so the sampled colours below are reproducible.
set.seed(2022)

# Example data: a numeric Outcome (1..5) and a categorical Variable drawn with
# replacement from three colours.
data <- data.frame(
  Outcome = seq(1, 5, by = 1),
  Variable = sample(c("Red", "Green", "Blue"), 5, replace = TRUE)
)
data
class(data)
## Outcome Variable
## 1 1 Blue
## 2 2 Green
## 3 3 Blue
## 4 4 Blue
## 5 5 Green
## [1] "data.frame"
# Method 1: mltools::one_hot() on a data.table ----
library(mltools)
library(data.table)

# Turn Variable into a factor, then build the data.table input once and reuse
# it for every one_hot() call below.
data[["Variable"]] <- as.factor(data[["Variable"]])
encoding_input <- as.data.table(data)

# Default options.
one_hot(encoding_input)
## Outcome Variable_Blue Variable_Green
## 1: 1 1 0
## 2: 2 0 1
## 3: 3 1 0
## 4: 4 1 0
## 5: 5 0 1

# sparsifyNAs = TRUE: for this data the printed result matches the default.
one_hot(encoding_input, sparsifyNAs = TRUE)
## Outcome Variable_Blue Variable_Green
## 1: 1 1 0
## 2: 2 0 1
## 3: 3 1 0
## 4: 4 1 0
## 5: 5 0 1

# naCols = TRUE: for this data the printed result matches the default.
one_hot(encoding_input, naCols = TRUE)
## Outcome Variable_Blue Variable_Green
## 1: 1 1 0
## 2: 2 0 1
## 3: 3 1 0
## 4: 4 1 0
## 5: 5 0 1

# dropCols = FALSE: the original Variable column is kept next to the
# indicator columns (see the transcript below).
one_hot(encoding_input, dropCols = FALSE)
## Outcome Variable Variable_Blue Variable_Green
## 1: 1 Blue 1 0
## 2: 2 Green 0 1
## 3: 3 Blue 1 0
## 4: 4 Blue 1 0
## 5: 5 Green 0 1

# dropUnusedLevels = TRUE: for this data the printed result matches the default.
one_hot(encoding_input, dropUnusedLevels = TRUE)
## Outcome Variable_Blue Variable_Green
## 1: 1 1 0
## 2: 2 0 1
## 3: 3 1 0
## 4: 4 1 0
## 5: 5 0 1
# Method 2: caret::dummyVars() ----
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice

# Build the encoder from all columns of `data`, apply it back to the same
# rows, and wrap the prediction in a data frame.
dummy <- dummyVars(" ~ .", data = data)
encoded <- predict(dummy, newdata = data)
newdata <- data.frame(encoded)
newdata
## Outcome Variable.Blue Variable.Green
## 1 1 1 0
## 2 2 0 1
## 3 3 1 0
## 4 4 1 0
## 5 5 0 1
# Method 3: reshape2::dcast() ----
library(reshape2)
##
## Attaching package: 'reshape2'
## The following objects are masked from 'package:data.table':
##
## dcast, melt

# dcast() exists in both data.table and reshape2, so call it through its
# namespace: the result then does not depend on which package was attached
# last. value.var is named explicitly instead of letting dcast() guess it
# (the guess emits a "use value.var to override" message).
# Each cell counts the rows with that Outcome/Variable pair, giving 0/1
# indicator columns for this data.
newdata <- reshape2::dcast(
  data = data,
  formula = Outcome ~ Variable,
  fun.aggregate = length,
  value.var = "Variable"
)
newdata
## Outcome Blue Green
## 1 1 1 0
## 2 2 0 1
## 3 3 1 0
## 4 4 1 0
## 5 5 0 1
# Applying one-hot encoding to multiple variables at the same time

# Reseed so the two sampled columns are reproducible.
set.seed(2022)

# Eight rows: an ID plus two categorical columns, each sampled with
# replacement (Colour is drawn before Quality).
n_rows <- 8
data <- data.frame(
  ID = seq(1, n_rows, by = 1),
  Colour = sample(c("Red", "Green", "Blue"), n_rows, replace = TRUE),
  Quality = sample(c("Poor", "Average", "Good"), n_rows, replace = TRUE)
)
data
## ID Colour Quality
## 1 1 Blue Poor
## 2 2 Green Average
## 3 3 Blue Good
## 4 4 Blue Poor
## 5 5 Green Average
## 6 6 Blue Good
## 7 7 Green Average
## 8 8 Blue Poor
# method_1: caret::dummyVars() over every column at once
# (relies on caret having been attached earlier in the script)
dummy <- dummyVars(" ~ .", data = data)
encoded <- predict(dummy, newdata = data)
newdata <- data.frame(encoded)
newdata
## ID ColourBlue ColourGreen QualityAverage QualityGood QualityPoor
## 1 1 1 0 0 0 1
## 2 2 0 1 1 0 0
## 3 3 1 0 0 1 0
## 4 4 1 0 0 0 1
## 5 5 0 1 1 0 0
## 6 6 1 0 0 1 0
## 7 7 0 1 1 0 0
## 8 8 1 0 0 0 1
# method_2: reshape2 melt() + dcast()
# Both functions are also exported by data.table, so they are called through
# the reshape2 namespace: the result then does not depend on which package was
# attached last. value.var is named explicitly rather than guessed.

# Long form: one row per ID / original column, with the category in `value`.
long_data <- reshape2::melt(data, id.vars = "ID")

# Wide form: one column per variable/value pair; each cell counts the matching
# rows, giving 0/1 indicators for this data.
newdata <- reshape2::dcast(
  data = long_data,
  formula = ID ~ variable + value,
  fun.aggregate = length,
  value.var = "value"
)
newdata
## ID Colour_Blue Colour_Green Quality_Average Quality_Good Quality_Poor
## 1 1 1 0 0 0 1
## 2 2 0 1 1 0 0
## 3 3 1 0 0 1 0
## 4 4 1 0 0 0 1
## 5 5 0 1 1 0 0
## 6 6 1 0 0 1 0
## 7 7 0 1 1 0 0
## 8 8 1 0 0 0 1
#REF https://datatricks.co.uk/one-hot-encoding-in-r-three-simple-methods