Data 607 Entropy Presentation 2

Build some random data. Make “field_1 correlate well with”has_cancer“, but let the rest just be random numbers I’m just using binary data, think of it like a”risk factor" you either have it or you don’t

rm(list=ls())
options(max.print = 9999)

set.seed(123)
library(stringr)

## Warning: package 'stringr' was built under R version 3.4.4

has_cancer <- rbinom(n=100000, size=1, prob=0.005)
field_1 <- as.character(rbinom(n=100000, size=1, prob=0.005))
field_2 <- as.character(rbinom(n=100000, size=1, prob=0.05))
field_3 <- as.character(rbinom(n=100000, size=1, prob=0.005))
field_4 <- as.character(rbinom(n=100000, size=1, prob=0.7))
field_5 <- as.character(rbinom(n=100000, size=1, prob=0.01))
field_6 <- as.character(rbinom(n=100000, size=1, prob=0.1))
field_7 <- as.character(rbinom(n=100000, size=1, prob=0.0005))
field_8 <- as.character(rbinom(n=100000, size=1, prob=0.99))
field_9 <- as.character(rbinom(n=100000, size=1, prob=0.5))
field_10 <- as.character(rbinom(n=100000, size=1, prob=0.5))

data = data.frame(has_cancer,field_1,field_2,field_3,field_4,field_5,field_6,field_7,field_8,field_9,field_10, stringsAsFactors = FALSE)


for (row in 1:nrow(data)) {
  rand = runif(1,0.00,1.00)
  if(data[row, "has_cancer"]==1) {
    if (rand > 0.95) {
      data[row,'field_1'] = '1'
    }
  }
}

Take a look at our data set

str(data)

## 'data.frame':    100000 obs. of  11 variables:
##  $ has_cancer: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ field_1   : chr  "0" "0" "0" "0" ...
##  $ field_2   : chr  "0" "0" "1" "0" ...
##  $ field_3   : chr  "0" "0" "0" "0" ...
##  $ field_4   : chr  "1" "1" "1" "1" ...
##  $ field_5   : chr  "0" "0" "0" "0" ...
##  $ field_6   : chr  "0" "0" "0" "0" ...
##  $ field_7   : chr  "0" "0" "0" "0" ...
##  $ field_8   : chr  "1" "1" "1" "1" ...
##  $ field_9   : chr  "1" "0" "1" "1" ...
##  $ field_10  : chr  "1" "0" "0" "1" ...

see what our average has_cancer rate is

paste("Overall Mean:",round(mean(data$has_cancer), 5))

## [1] "Overall Mean: 0.00492"

paste("Mean when Field_1 = 0:",round(mean(data$has_cancer[data$field_1==0]), 5))

## [1] "Mean when Field_1 = 0: 0.00471"

paste("Mean when Field_1 = 1:",round(mean(data$has_cancer[data$field_1==1]), 5))

## [1] "Mean when Field_1 = 1: 0.04545"

table(data$has_cancer, data$field_1)

##    
##         0     1
##   0 99025   483
##   1   469    23

calculate the parent entropy

overall_entropy = -1*(mean(data$has_cancer)*log(mean(data$has_cancer), base = 2)) - (1-mean(data$has_cancer))*log(1-mean(data$has_cancer), base = 2)
overall_entropy

## [1] 0.04480283

create data frame of columns so we can add the information_gain later

field_data <- data.frame(column_name = character(),
                         information_gain_add=double(),
                         stringsAsFactors = FALSE
)

loop through each column

for(col in names(data)) {
  field_data<-rbind(field_data, data.frame(column_name = col,
                                           information_gain_add = 0,
                                           stringsAsFactors = FALSE
  )
  )
  
}

field_data

##    column_name information_gain_add
## 1   has_cancer                    0
## 2      field_1                    0
## 3      field_2                    0
## 4      field_3                    0
## 5      field_4                    0
## 6      field_5                    0
## 7      field_6                    0
## 8      field_7                    0
## 9      field_8                    0
## 10     field_9                    0
## 11    field_10                    0

Let’s remove the target variable (has_cancer) as well

field_data= field_data[(field_data$column_name!='has_cancer'),]

pull the data new data set

field_performance <- data.frame(column_name = character(),
                                column_value = character(),
                                count=integer(),
                                has_cancer=double(),
                                entropy=double(),
                                information_gain = double(),
                                stringsAsFactors = FALSE
)

for(field_row in 1:nrow(field_data)) {
  
  #"col" is just the column name
  col = field_data[field_row,'column_name']
  
  #"values" is a list of unique values for the field
  values = unique(data[col])
  
  #loop through values
  for (value_row in 1:nrow(values)) {
    
    #"val" is the row we're looking at now, it corresponds to 1 unique value of 1 specific field
    val = values[value_row,]
    
    #Pull all of the rows that match this unique value from the original dataset into "data_temp"
    #I like to count missing (N/A) as it's own category
    if (is.na(val)) {
      data_temp = data[is.na(data[,col]),]
    } else {
      data_temp = data[data[,col] == val,]
    }
    
    # Now lets calculate some metrics
    
    #number of rows that match this value
    count = nrow(data_temp)
    
    #has_cancer is the average of the binary has_cancer variable
    #it can also be thought of as probability that has_cancer = 1
    has_cancer = mean(data_temp$has_cancer)
    
    #calculate entropy
    # We can't calulate the log of 0, but we know the entropy will be 1 in that situation
    if (has_cancer ==0 | has_cancer == 1) {
      entropy = 0
    } else {
      entropy = -1*(has_cancer*log(has_cancer, base = 2)) - (1-has_cancer)*log(1-has_cancer, base = 2)
    }
    
    # information_gain is the amount that we'd add to calculate information gain for the full field
    information_gain = count/nrow(data) * entropy
    
    #Add this to the "information_gain_add" of main data frame
    field_data[field_row,'information_gain_add'] = field_data[field_row,'information_gain_add'] +information_gain
    
    field_performance<-rbind(field_performance, data.frame(column_name = toString(col),
                                                           column_value = toString(val),
                                                           count=count,
                                                           has_cancer=round(has_cancer,4),
                                                           entropy = entropy,
                                                           information_gain = information_gain,
                                                           stringsAsFactors = FALSE
    )
    )
  }
  
}

field_performance[order(-field_performance$entropy),]

##    column_name column_value count has_cancer    entropy information_gain
## 2      field_1            1   506     0.0455 0.26676499     1.349831e-03
## 14     field_7            1    53     0.0189 0.13503620     7.156919e-05
## 10     field_5            1   971     0.0082 0.06887797     6.688050e-04
## 16     field_8            0   982     0.0061 0.05372441     5.275737e-04
## 17     field_9            1 49776     0.0052 0.04680907     2.329968e-02
## 7      field_4            1 70188     0.0051 0.04628974     3.248984e-02
## 12     field_6            1 10114     0.0050 0.04573910     4.626053e-03
## 3      field_2            0 94890     0.0050 0.04513685     4.283036e-02
## 20    field_10            0 49899     0.0049 0.04503249     2.247076e-02
## 5      field_3            0 99502     0.0049 0.04483748     4.461419e-02
## 13     field_7            0 99947     0.0049 0.04474617     4.472245e-02
## 15     field_8            1 99018     0.0049 0.04471241     4.427333e-02
## 11     field_6            0 89886     0.0049 0.04469720     4.017653e-02
## 19    field_10            1 50101     0.0049 0.04457383     2.233194e-02
## 9      field_5            0 99029     0.0049 0.04455339     4.412078e-02
## 1      field_1            0 99494     0.0047 0.04321738     4.299870e-02
## 18     field_9            0 50224     0.0047 0.04279433     2.149303e-02
## 8      field_4            0 29812     0.0045 0.04125710     1.229957e-02
## 4      field_2            1  5110     0.0041 0.03849253     1.966969e-03
## 6      field_3            1   498     0.0040 0.03775019     1.879959e-04

Calculate Overall Information Gain and Display

field_data$information_gain = overall_entropy - field_data$information_gain_add
field_data[order(-field_data$information_gain),]

##    column_name information_gain_add information_gain
## 2      field_1           0.04434853     4.542946e-04
## 5      field_4           0.04478941     1.341752e-05
## 6      field_5           0.04478959     1.324292e-05
## 10     field_9           0.04479271     1.012060e-05
## 8      field_7           0.04479402     8.810233e-06
## 3      field_2           0.04479732     5.505240e-06
## 9      field_8           0.04480091     1.922218e-06
## 4      field_3           0.04480219     6.428654e-07
## 7      field_6           0.04480258     2.470461e-07
## 11    field_10           0.04480270     1.320625e-07

Data 607 Entropy Presentation 2

Steven Ellingson

September 11, 2019