# Calculates the Gini impurity of a set.
# Gini impurity measures the likelihood that a new instance of a random variable is classified incorrectly
# when that instance is labelled at random according to the distribution of class labels in the data set.
# For class proportions p_i it equals sum(p_i * (1 - p_i)) = 1 - sum(p_i^2).
# Gini impurity is bounded below by 0, which occurs when the data set contains only one class.
# Method 1: gini_impurity() from the mltools package ----
# install.packages("mltools")  # run once if the package is not installed
library(mltools)
# Example calls; each "## [1] ..." line records the value printed by the call above it.
# Raw category instances: 2 red, 1 blue, 1 green.
gini_impurity(c("red", "red", "blue", "green"))
## [1] 0.625
# The same distribution given as named category frequencies.
gini_impurity(c(red=2, blue=1, green=1))
## [1] 0.625
# Unnamed numeric input: the recorded result (0.5) corresponds to two classes with equal counts.
gini_impurity(c(3,3))
## [1] 0.5
# Unnamed counts 2/1/1 give the same result as the named frequencies above.
gini_impurity(c(2,1,1))
## [1] 0.625
# Method 2: standalone implementation that does not need mltools ----
#' Gini impurity of a set of class labels or class counts
#'
#' Gini impurity is the probability that a value is incorrectly labelled when
#' it is labelled at random according to the distribution of classes in the
#' set.
#'
#' @param vals Either raw category instances
#'   (e.g. `c("red", "red", "blue", "green")`) or category frequencies
#'   (e.g. `c(red = 2, blue = 1, green = 1)`). Any numeric input, including a
#'   `table()` of counts, is treated as frequencies; everything else is
#'   tabulated with `table()`.
#' @return A single number, `sum(p * (1 - p))` over the class proportions `p`.
#'   Returns 0 when the counts sum to zero (no observations).
gini_impurity_c <- function(vals) {
  # is.numeric() is base R (no reliance on the methods package, unlike is())
  # and is TRUE for doubles, integers and table objects, so precomputed
  # counts are never tabulated a second time.
  if (is.numeric(vals)) {
    counts <- vals
  } else {
    counts <- table(vals)
  }
  total <- sum(counts)
  # A set with no observations has no impurity. This avoids 0 / 0 = NaN for
  # inputs such as c(0, 0) and matches the 0 already returned for empty
  # vectors. isTRUE() lets an NA total fall through and propagate as NA.
  if (isTRUE(total == 0)) {
    return(0)
  }
  props <- counts / total
  sum(props * (1 - props))
}
# Example: raw category instances (2 red, 1 blue, 1 green) passed to gini_impurity_c().
vals_list <- c("red", "red", "blue", "green")
gini_impurity_c(vals_list)
# Recorded output of the call above:
## [1] 0.625
# References
# Background on Gini impurity: https://bambielli.com/til/2017-10-29-gini-impurity/
# Source of gini_impurity() in the 'mltools' package:
# https://github.com/ben519/mltools/blob/master/R/gini_impurity.R