# Gini-Impurity.R

# Calculates the Gini impurity of a set.
# Gini impurity measures how likely a new instance of a random variable is to
# be classified incorrectly if that instance is labelled at random according to
# the distribution of class labels in the data set.
# Gini impurity is bounded below by 0; it equals 0 when the data set contains
# only one class.
#################################method_1
# Uncomment the next line to install mltools if it is not already installed.
#install.packages('mltools')
library(mltools)
# gini_impurity() below comes from the mltools package (see the references at
# the end of this file). Lines starting with "## [1]" record the output of the
# call directly above them.

# Raw category instances: two "red", one "blue", one "green".
gini_impurity(c("red", "red", "blue", "green"))

## [1] 0.625

# The same three classes given as named category frequencies.
gini_impurity(c(red=2, blue=1, green=1))

## [1] 0.625

# Unnamed numeric input; the recorded result 0.5 matches reading the values as
# frequencies of two equally common classes.
gini_impurity(c(3,3))

## [1] 0.5

# Same frequencies as the named example above, without names.
gini_impurity(c(2,1,1))

## [1] 0.625

#################################method_2
# Gini impurity of a set of values.
#
# Gini impurity is the probability that a value is labelled incorrectly when
# labels are drawn at random according to the distribution of classes in the
# set.
#
# Args:
#   vals: either raw category instances (e.g. c("red", "red", "blue", "green"))
#         or category frequencies (e.g. c(red = 2, blue = 1, green = 1)). Any
#         numeric input, including the result of table(), is read as
#         frequencies; anything else is tabulated with table().
#
# Returns:
#   A single number: sum(p * (1 - p)) over the class proportions p. Empty
#   input gives 0; frequencies that sum to zero give NaN.
#
# Raises:
#   An error if any frequency is negative.
gini_impurity_c <- function(vals) {
  # is.numeric() replaces methods::is(vals, "numeric"): it needs no attached
  # methods package, and it recognises a pre-computed table() as frequencies
  # instead of re-tabulating it (which would count how often each count occurs).
  if (is.numeric(vals)) {
    counts <- vals
  } else {
    counts <- table(vals)
  }

  # Frequencies cannot be negative; fail loudly rather than return a
  # meaningless value. NA frequencies still propagate to an NA result.
  if (any(counts < 0, na.rm = TRUE)) {
    stop("category frequencies must be non-negative", call. = FALSE)
  }

  proportions <- counts / sum(counts)
  sum(proportions * (1 - proportions))
}

# Example input: four raw category labels (two "red", one "blue", one "green").
vals_list <- c("red", "red", "blue", "green")
# Apply the function defined above; the recorded output follows the call.
gini_impurity_c(vals_list)

## [1] 0.625

#REF
#https://bambielli.com/til/2017-10-29-gini-impurity/
#package 'mltools'
#https://github.com/ben519/mltools/blob/master/R/gini_impurity.R

# Gini-Impurity.R
#
# liyix
#
# 2020-09-13