Build some random data. Make “field_1 correlate well with”has_cancer“, but let the rest just be random numbers I’m just using binary data, think of it like a”risk factor" you either have it or you don’t
rm(list=ls())
options(max.print = 9999)
set.seed(123)
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.4
has_cancer <- rbinom(n=100000, size=1, prob=0.005)
field_1 <- as.character(rbinom(n=100000, size=1, prob=0.005))
field_2 <- as.character(rbinom(n=100000, size=1, prob=0.05))
field_3 <- as.character(rbinom(n=100000, size=1, prob=0.005))
field_4 <- as.character(rbinom(n=100000, size=1, prob=0.7))
field_5 <- as.character(rbinom(n=100000, size=1, prob=0.01))
field_6 <- as.character(rbinom(n=100000, size=1, prob=0.1))
field_7 <- as.character(rbinom(n=100000, size=1, prob=0.0005))
field_8 <- as.character(rbinom(n=100000, size=1, prob=0.99))
field_9 <- as.character(rbinom(n=100000, size=1, prob=0.5))
field_10 <- as.character(rbinom(n=100000, size=1, prob=0.5))
data = data.frame(has_cancer,field_1,field_2,field_3,field_4,field_5,field_6,field_7,field_8,field_9,field_10, stringsAsFactors = FALSE)
for (row in 1:nrow(data)) {
rand = runif(1,0.00,1.00)
if(data[row, "has_cancer"]==1) {
if (rand > 0.95) {
data[row,'field_1'] = '1'
}
}
}
Take a look at our data set
str(data)
## 'data.frame': 100000 obs. of 11 variables:
## $ has_cancer: int 0 0 0 0 0 0 0 0 0 0 ...
## $ field_1 : chr "0" "0" "0" "0" ...
## $ field_2 : chr "0" "0" "1" "0" ...
## $ field_3 : chr "0" "0" "0" "0" ...
## $ field_4 : chr "1" "1" "1" "1" ...
## $ field_5 : chr "0" "0" "0" "0" ...
## $ field_6 : chr "0" "0" "0" "0" ...
## $ field_7 : chr "0" "0" "0" "0" ...
## $ field_8 : chr "1" "1" "1" "1" ...
## $ field_9 : chr "1" "0" "1" "1" ...
## $ field_10 : chr "1" "0" "0" "1" ...
see what our average has_cancer rate is
paste("Overall Mean:",round(mean(data$has_cancer), 5))
## [1] "Overall Mean: 0.00492"
paste("Mean when Field_1 = 0:",round(mean(data$has_cancer[data$field_1==0]), 5))
## [1] "Mean when Field_1 = 0: 0.00471"
paste("Mean when Field_1 = 1:",round(mean(data$has_cancer[data$field_1==1]), 5))
## [1] "Mean when Field_1 = 1: 0.04545"
table(data$has_cancer, data$field_1)
##
## 0 1
## 0 99025 483
## 1 469 23
calculate the parent entropy
overall_entropy = -1*(mean(data$has_cancer)*log(mean(data$has_cancer), base = 2)) - (1-mean(data$has_cancer))*log(1-mean(data$has_cancer), base = 2)
overall_entropy
## [1] 0.04480283
create data frame of columns so we can add the information_gain later
field_data <- data.frame(column_name = character(),
information_gain_add=double(),
stringsAsFactors = FALSE
)
loop through each column
for(col in names(data)) {
field_data<-rbind(field_data, data.frame(column_name = col,
information_gain_add = 0,
stringsAsFactors = FALSE
)
)
}
field_data
## column_name information_gain_add
## 1 has_cancer 0
## 2 field_1 0
## 3 field_2 0
## 4 field_3 0
## 5 field_4 0
## 6 field_5 0
## 7 field_6 0
## 8 field_7 0
## 9 field_8 0
## 10 field_9 0
## 11 field_10 0
Let’s remove the target variable (has_cancer) as well
field_data= field_data[(field_data$column_name!='has_cancer'),]
pull the data new data set
field_performance <- data.frame(column_name = character(),
column_value = character(),
count=integer(),
has_cancer=double(),
entropy=double(),
information_gain = double(),
stringsAsFactors = FALSE
)
for(field_row in 1:nrow(field_data)) {
#"col" is just the column name
col = field_data[field_row,'column_name']
#"values" is a list of unique values for the field
values = unique(data[col])
#loop through values
for (value_row in 1:nrow(values)) {
#"val" is the row we're looking at now, it corresponds to 1 unique value of 1 specific field
val = values[value_row,]
#Pull all of the rows that match this unique value from the original dataset into "data_temp"
#I like to count missing (N/A) as it's own category
if (is.na(val)) {
data_temp = data[is.na(data[,col]),]
} else {
data_temp = data[data[,col] == val,]
}
# Now lets calculate some metrics
#number of rows that match this value
count = nrow(data_temp)
#has_cancer is the average of the binary has_cancer variable
#it can also be thought of as probability that has_cancer = 1
has_cancer = mean(data_temp$has_cancer)
#calculate entropy
# We can't calulate the log of 0, but we know the entropy will be 1 in that situation
if (has_cancer ==0 | has_cancer == 1) {
entropy = 0
} else {
entropy = -1*(has_cancer*log(has_cancer, base = 2)) - (1-has_cancer)*log(1-has_cancer, base = 2)
}
# information_gain is the amount that we'd add to calculate information gain for the full field
information_gain = count/nrow(data) * entropy
#Add this to the "information_gain_add" of main data frame
field_data[field_row,'information_gain_add'] = field_data[field_row,'information_gain_add'] +information_gain
field_performance<-rbind(field_performance, data.frame(column_name = toString(col),
column_value = toString(val),
count=count,
has_cancer=round(has_cancer,4),
entropy = entropy,
information_gain = information_gain,
stringsAsFactors = FALSE
)
)
}
}
field_performance[order(-field_performance$entropy),]
## column_name column_value count has_cancer entropy information_gain
## 2 field_1 1 506 0.0455 0.26676499 1.349831e-03
## 14 field_7 1 53 0.0189 0.13503620 7.156919e-05
## 10 field_5 1 971 0.0082 0.06887797 6.688050e-04
## 16 field_8 0 982 0.0061 0.05372441 5.275737e-04
## 17 field_9 1 49776 0.0052 0.04680907 2.329968e-02
## 7 field_4 1 70188 0.0051 0.04628974 3.248984e-02
## 12 field_6 1 10114 0.0050 0.04573910 4.626053e-03
## 3 field_2 0 94890 0.0050 0.04513685 4.283036e-02
## 20 field_10 0 49899 0.0049 0.04503249 2.247076e-02
## 5 field_3 0 99502 0.0049 0.04483748 4.461419e-02
## 13 field_7 0 99947 0.0049 0.04474617 4.472245e-02
## 15 field_8 1 99018 0.0049 0.04471241 4.427333e-02
## 11 field_6 0 89886 0.0049 0.04469720 4.017653e-02
## 19 field_10 1 50101 0.0049 0.04457383 2.233194e-02
## 9 field_5 0 99029 0.0049 0.04455339 4.412078e-02
## 1 field_1 0 99494 0.0047 0.04321738 4.299870e-02
## 18 field_9 0 50224 0.0047 0.04279433 2.149303e-02
## 8 field_4 0 29812 0.0045 0.04125710 1.229957e-02
## 4 field_2 1 5110 0.0041 0.03849253 1.966969e-03
## 6 field_3 1 498 0.0040 0.03775019 1.879959e-04
Calculate Overall Information Gain and Display
field_data$information_gain = overall_entropy - field_data$information_gain_add
field_data[order(-field_data$information_gain),]
## column_name information_gain_add information_gain
## 2 field_1 0.04434853 4.542946e-04
## 5 field_4 0.04478941 1.341752e-05
## 6 field_5 0.04478959 1.324292e-05
## 10 field_9 0.04479271 1.012060e-05
## 8 field_7 0.04479402 8.810233e-06
## 3 field_2 0.04479732 5.505240e-06
## 9 field_8 0.04480091 1.922218e-06
## 4 field_3 0.04480219 6.428654e-07
## 7 field_6 0.04480258 2.470461e-07
## 11 field_10 0.04480270 1.320625e-07