library('caret')
## Loading required package: ggplot2
## Loading required package: lattice
library('dplyr')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('ggplot2')
library("corrplot")
## corrplot 0.95 loaded
library("randomForest")
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
zip_url <- "https://s3.us-east-2.amazonaws.com/artificium.us/datasets/donations-salted.csv.zip"
zip_file <- tempfile(fileext = ".zip")
download.file(zip_url, zip_file, mode = 'wb')
df <- read.csv(unz(zip_file, "donations-salted.csv"), header = TRUE, sep = ",")
head(df)
str(df)
## 'data.frame': 19372 obs. of 20 variables:
## $ CONTROL_NUMBER : int 5 12 37 38 41 52 53 67 70 71 ...
## $ DONATED : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
## $ DONATION_AMT : int NA 1000 NA NA NA NA NA 500 NA 1600 ...
## $ DONOR_AGE : int 87 79 75 NA 74 63 71 79 41 63 ...
## $ URBANICITY : chr "?" "R" "S" "U" ...
## $ DONOR_GENDER : chr "M" "M" "F" "F" ...
## $ HOME_OWNER : chr "H" "H" "H" "H" ...
## $ INCOME_LEVEL : int 2 7 5 6 2 3 5 1 4 4 ...
## $ SES : chr "." "45" "11" "4" ...
## $ MEDIAN_HOME_VALUE : num 554000 334000 2388000 1688000 514000 ...
## $ MEDIAN_HOUSEHOLD_INCOME: num 294000 212000 405000 153000 328000 182000 122000 189000 180000 307000 ...
## $ DONATION_RESPONSE : int 0 0 0 0 8 0 0 0 0 0 ...
## $ MONTHS_SINCE_LAST_GIFT : int 26 7 6 6 18 19 21 9 17 9 ...
## $ EMAILS_12 : int 17 28 43 44 25 29 25 33 36 38 ...
## $ LIFETIME_GIFT_COUNT : int 35 25 36 37 12 22 20 20 34 22 ...
## $ LIFETIME_EMAILS : int 19 32 44 31 30 22 29 33 36 27 ...
## $ LIFETIME_GIFT_AMOUNT : num 29700 36800 60300 43500 10600 12800 22000 10100 17100 15000 ...
## $ LIFETIME_MAX_GIFT_AMT : num 2000 2500 2800 1700 2500 1000 2000 700 700 1100 ...
## $ LIFETIME_MIN_GIFT_AMT : int 500 500 500 300 500 300 500 300 300 300 ...
## $ LIFETIME_AVG_GIFT_AMT : int 849 1472 1675 1176 883 582 1100 505 503 682 ...
summary(df)
## CONTROL_NUMBER DONATED DONATION_AMT DONOR_AGE
## Min. : 5 Mode :logical Min. : 100 Min. : 0.00
## 1st Qu.: 48289 FALSE:14529 1st Qu.: 1000 1st Qu.:47.00
## Median : 96937 TRUE :4843 Median : 1300 Median :60.00
## Mean : 96546 Mean : 1562 Mean :58.92
## 3rd Qu.:145430 3rd Qu.: 2000 3rd Qu.:73.00
## Max. :191779 Max. :20000 Max. :87.00
## NA's :14529 NA's :4795
## URBANICITY DONOR_GENDER HOME_OWNER INCOME_LEVEL
## Length:19372 Length:19372 Length:19372 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:2.000
## Mode :character Mode :character Mode :character Median :4.000
## Mean :3.908
## 3rd Qu.:5.000
## Max. :7.000
## NA's :4392
## SES MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME DONATION_RESPONSE
## Length:19372 Min. : 0 Min. : 0 Min. : 0.000
## Class :character 1st Qu.: 518000 1st Qu.: 232000 1st Qu.: 0.000
## Mode :character Median : 747000 Median : 311000 Median : 0.000
## Mean :1079872 Mean : 341970 Mean : 3.362
## 3rd Qu.:1227000 3rd Qu.: 417000 3rd Qu.: 3.000
## Max. :6000000 Max. :1500000 Max. :241.000
##
## MONTHS_SINCE_LAST_GIFT EMAILS_12 LIFETIME_GIFT_COUNT LIFETIME_EMAILS
## Min. : 4.00 Min. : 2.00 Min. : 1.00 Min. : 2.00
## 1st Qu.:16.00 1st Qu.:16.00 1st Qu.: 4.00 1st Qu.:11.00
## Median :18.00 Median :18.00 Median : 8.00 Median :18.00
## Mean :18.19 Mean :18.27 Mean : 9.98 Mean :18.67
## 3rd Qu.:20.00 3rd Qu.:19.00 3rd Qu.:14.00 3rd Qu.:26.00
## Max. :27.00 Max. :77.00 Max. :95.00 Max. :56.00
##
## LIFETIME_GIFT_AMOUNT LIFETIME_MAX_GIFT_AMT LIFETIME_MIN_GIFT_AMT
## Min. : 1500 Min. : 500 Min. : 0.0
## 1st Qu.: 4200 1st Qu.: 1200 1st Qu.: 300.0
## Median : 7900 Median : 1600 Median : 500.0
## Mean : 10443 Mean : 1921 Mean : 762.1
## 3rd Qu.: 13200 3rd Qu.: 2100 3rd Qu.: 1000.0
## Max. :377500 Max. :100000 Max. :45000.0
##
## LIFETIME_AVG_GIFT_AMT
## Min. : 136
## 1st Qu.: 800
## Median : 1120
## Mean : 1286
## 3rd Qu.: 1500
## Max. :45000
##
# check for NA values
colSums(is.na(df))
## CONTROL_NUMBER DONATED DONATION_AMT
## 0 0 14529
## DONOR_AGE URBANICITY DONOR_GENDER
## 4795 0 0
## HOME_OWNER INCOME_LEVEL SES
## 0 4392 0
## MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME DONATION_RESPONSE
## 0 0 0
## MONTHS_SINCE_LAST_GIFT EMAILS_12 LIFETIME_GIFT_COUNT
## 0 0 0
## LIFETIME_EMAILS LIFETIME_GIFT_AMOUNT LIFETIME_MAX_GIFT_AMT
## 0 0 0
## LIFETIME_MIN_GIFT_AMT LIFETIME_AVG_GIFT_AMT
## 0 0
sum(is.na(df))
## [1] 23716
# check for empty strings
any(apply(df, 2, function(x) any(x == ""| x == " ")))
## [1] NA
# check for unknown entries in the data frame
colSums(df == "uknown"|df == "Uknown")
## CONTROL_NUMBER DONATED DONATION_AMT
## 0 0 NA
## DONOR_AGE URBANICITY DONOR_GENDER
## NA 0 0
## HOME_OWNER INCOME_LEVEL SES
## 0 NA 0
## MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME DONATION_RESPONSE
## 0 0 0
## MONTHS_SINCE_LAST_GIFT EMAILS_12 LIFETIME_GIFT_COUNT
## 0 0 0
## LIFETIME_EMAILS LIFETIME_GIFT_AMOUNT LIFETIME_MAX_GIFT_AMT
## 0 0 0
## LIFETIME_MIN_GIFT_AMT LIFETIME_AVG_GIFT_AMT
## 0 0
Lets investigate columns : INCOME_LEVEL , DONATION_AMT , DONOR_AGE
There are no empty strings in the data frame. Yay!
There are no “Unknown” or “unknown” entries in the data frame. Yay!
# DONATION_AMT
#DONATION_AMT: the monetary amount of the most recent gift
unique(df$DONATION_AMT)
## [1] NA 1000 500 1600 300 1200 800 1500 3300 2000 2500 3500
## [13] 200 1100 1800 4000 2100 3200 3000 1400 2400 5000 1250 700
## [25] 900 400 600 2800 1300 1700 2700 3600 2200 2300 3100 1900
## [37] 4800 7500 10000 4700 4400 9500 100 10200 10100 3400 1070 6000
## [49] 4500 2600 1750 1687 3800 4100 20000 3700 1825 2900 5100 4600
## [61] 4200 250 4300 5300 750 525 5500 450 4421 1392 15000
sum(is.na(df$DONATION_AMT))
## [1] 14529
# lets look at the scale of missingness with NA values
14529 / nrow(df) * 100
## [1] 75
# lets look at the rows
df[is.na(df$DONATION_AMT), ]
# lets impute the NA with 0 donation amount
df$DONATION_AMT[is.na(df$DONATION_AMT)] <- 0
# verify the change
sum(is.na(df$DONATION_AMT))
## [1] 0
There are 14,529 NA vlaues in the DONATION_AMT representing 75%.
Using the mean would skew the distribution in the data set.
The rows who’s DONATION_AMT is NA have no specific pattern
SOLUTION : Impute the NA with 0 donation amount
# DONOR_AGE
# DONOR_AGE - the age of donors in years
unique(df$DONOR_AGE)
## [1] 87 79 75 NA 74 63 71 41 73 81 76 69 45 43 55 77 37 52 30 67 65 59 42 58 56
## [26] 80 82 64 48 85 47 70 40 32 53 86 54 33 60 49 7 17 57 78 50 39 66 51 72 38
## [51] 61 68 24 16 84 44 34 83 35 29 26 27 46 31 62 23 36 18 6 25 28 19 21 22 0
## [76] 2 12 20 15 4 14
sum(is.na(df$DONOR_AGE))
## [1] 4795
# lets look at the rows with NA values
df[is.na(df$DONOR_AGE), ]
table(df$DONOR_AGE)
##
## 0 2 4 6 7 12 14 15 16 17 18 19 20 21 22 23 24 25 26 27
## 2 3 1 10 74 1 1 2 11 101 2 5 1 13 9 23 10 46 35 124
## 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
## 37 79 55 111 65 137 80 202 117 282 112 244 160 261 179 278 170 343 178 358
## 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
## 212 404 237 364 177 321 212 361 181 365 186 320 169 264 208 271 178 347 228 411
## 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
## 231 343 225 350 239 395 295 380 281 381 251 326 215 276 192 239 140 174 144 162
summary(df$DONOR_AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 47.00 60.00 58.92 73.00 87.00 4795
# sum of donors who are 18 years and below
sum(df$DONOR_AGE < 18, na.rm = TRUE)
## [1] 206
# lets use a threshold of 18 and equate all donors <18 to NA for age
df$DONOR_AGE[df$DONOR_AGE <18] <- NA
# impute the NA with the median age
median_age <- median(df$DONOR_AGE, na.rm = TRUE)
df$DONOR_AGE[is.na(df$DONOR_AGE)] <- median_age
# verify the change
sum(is.na(df$DONOR_AGE))
## [1] 0
table(df$DONOR_AGE)
##
## 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
## 2 5 1 13 9 23 10 46 35 124 37 79 55 111 65 137
## 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
## 80 202 117 282 112 244 160 261 179 278 170 343 178 358 212 404
## 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
## 237 364 177 321 212 361 181 365 186 320 5170 264 208 271 178 347
## 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
## 228 411 231 343 225 350 239 395 295 380 281 381 251 326 215 276
## 82 83 84 85 86 87
## 192 239 140 174 144 162
There are 4795 NA values in DONOR_AGE
There is a donor at 0 age. I think thats likley an error
ACTION: I set a threshold of 18 and all DONOR-AGE below 18 were set to NA then imputed all NA vlaues with the median age.
# INCOME_LEVEL: one of 7 income levels; higher means more income; exact bracket structure is unknown
unique(df$INCOME_LEVEL)
## [1] 2 7 5 6 3 1 4 NA
table(df$INCOME_LEVEL)
##
## 1 2 3 4 5 6 7
## 1822 2626 1699 2526 3149 1591 1567
sum(is.na(df$INCOME_LEVEL))
## [1] 4392
df[is.na(df$INCOME_LEVEL), ]
# check what levsls 1- 7 mean using the donation amount col
tapply(df$DONATION_AMT, df$INCOME_LEVEL, mean, na.rm = TRUE)
## 1 2 3 4 5 6 7
## 277.0033 331.8751 357.2101 387.3120 425.6812 473.1301 481.0466
# imput the the NA values with the median
median_income <- median(df$INCOME_LEVEL, na.rm = TRUE)
df$INCOME_LEVEL[is.na(df$INCOME_LEVEL)] <- median_income
# verify the change
sum(is.na(df$INCOME_LEVEL))
## [1] 0
There are 4392 NA values in INCOME_LEVEL
There is no specific pattern in the rows of the involved na column
I compared INCOME_LEVEL with DONATION_AMT to understand the what levels 1-7 mean. They mean lowest to highest
ACTION: Impute the NA values with the median
# SES: socio-economic status for donor cluster; higher number means a higher socio-economic status
unique(df$SES)
## [1] "." "45" "11" "4" "49" "8" "50" "28" "30" "43" "53" "42" "46" "20" "16"
## [16] "40" "7" "34" "23" "35" "41" "25" "10" "1" "9" "2" "12" "14" "37" "36"
## [31] "15" "39" "38" "18" "48" "24" "3" "13" "31" "5" "27" "19" "51" "22" "17"
## [46] "26" "21" "44" "6" "29" "33" "47" "32" "52"
table(unique(df$SES))
##
## . 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 28 29 3 30 31
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 32 33 34 35 36 37 38 39 4 40 41 42 43 44 45 46 47 48 49 5 50 51 52 53 6 7
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 8 9
## 1 1
# compare SES col with DONATION AMT
tapply(df$DONATION_AMT, df$SES, mean, na.rm = TRUE)
## . 1 10 11 12 13 14 15
## 630.0022 581.5900 277.5194 441.9421 415.3724 600.6908 468.8326 426.2332
## 16 17 18 19 2 20 21 22
## 366.1458 377.9370 420.5977 336.7347 407.7632 445.7413 385.0567 427.6892
## 23 24 25 26 27 28 29 3
## 369.1126 426.8553 335.8974 337.6238 378.8288 487.1720 451.7647 524.7500
## 30 31 32 33 34 35 36 37
## 253.9499 389.1566 267.7632 288.9908 390.8451 473.0399 350.8380 328.9216
## 38 39 4 40 41 42 43 44
## 392.9167 364.4531 398.2301 414.2169 276.9142 416.0458 308.5470 300.3916
## 45 46 47 48 49 5 50 51
## 287.1203 406.7751 228.1081 345.0000 321.3333 380.9045 373.0769 278.6957
## 52 53 6 7 8 9
## 233.3333 419.8020 308.9431 433.6957 316.4021 294.7712
# impute the "." with the mode of SES column
mode_SES <- as.numeric(sort(table(df$SES), decreasing = TRUE)[1])
df$SES[df$SES == "."] <- mode_SES
# verify the change
table(unique(df$SES))
##
## 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 28 29 3 30 31 32 33 34 35 36 37 38 39 4 40 41 42 43 44 45
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 46 47 48 49 5 50 51 52 53 6 7 8 830 9
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
df$SES <- as.numeric(df$SES)
There is a “.” entry in the SES column.
I compared it with the DONATION AMT column. There is no order to the groups of donor clusters hence SES is not ordinal but categorical
ACTION: Impute the “.” with the mode.
# URBANICITY: the “urbanicity” of the donor’s place of residence:
# U = urban (i.e., lives in a (large) city),
# S = suburban (outside but near the city),
# T = town (lives in a “small” town),
# R = rural (e.g., farm, away from town) ,
# ? = unknown
unique(df$URBANICITY)
## [1] "?" "R" "S" "U" "C" "T"
table(df$URBANICITY, useNA = "always")
##
## ? C R S T U <NA>
## 454 4022 4005 4491 3944 2456 0
df$URBANICITY[df$URBANICITY == "?"] <- NA
df$URBANICITY[df$URBANICITY == "C"] <- NA
# verify
unique(df$URBANICITY)
## [1] NA "R" "S" "U" "T"
# impute the NA with the mode
mode_URBANICITY <- names(sort(table(df$URBANICITY), decreasing = TRUE)[1])
df$URBANICITY[is.na(df$URBANICITY)] <- mode_URBANICITY
# verify the change
table(df$URBANICITY, useNA = "always")
##
## R S T U <NA>
## 4005 8967 3944 2456 0
proportions <- prop.table(table(df$URBANICITY[!is.na(df$URBANICITY)]))
proportions
##
## R S T U
## 0.2067417 0.4628846 0.2035928 0.1267809
table(is.na(df$URBANICITY))
##
## FALSE
## 19372
# change the entries to numerical values
dummies <- dummyVars(~URBANICITY, data = df)
URBANICITY_encoded <- predict(dummies, df) %>% as.data.frame()
df <- bind_cols(
df %>% select(-URBANICITY), URBANICITY_encoded
)
# verify for any NA
sum(is.na(df$URBANICITYR))
## [1] 0
sum(is.na(df$URBANICITYS))
## [1] 0
sum(is.na(df$URBANICITYT))
## [1] 0
sum(is.na(df$URBANICITYU))
## [1] 0
There is “?” which could be NA
The metadata does not have “C” but the dataset has “C”
ACTION: use one hot encoding to impute values in URBANICITY
# lets explore the binary columns - DONATED , DONOR_GENDER, HOME_OWNER
table(unique(df$DONOR_GENDER))
##
## F M U
## 1 1 1
table(unique(df$HOME_OWNER))
##
## H N U
## 1 1 1
# DONOR_GENDER
table(df$DONOR_GENDER)
##
## F M U
## 9984 7633 1755
# DONOR-GENEDER is nominal categorical
# use one hot encoding
dummies <- dummyVars(~DONOR_GENDER, data = df)
DONOR_GENDER_encoded <- predict(dummies, df) %>% as.data.frame()
df <- bind_cols(
df %>% select(-DONOR_GENDER), DONOR_GENDER_encoded
)
# HOME_OWNER: whether donor is a home owner or not: H = yes, U = unknown, N = no
table(df$HOME_OWNER)
##
## H N U
## 9456 2151 7765
# use one-hot-encoding
dummies <- dummyVars(~HOME_OWNER, data = df)
HOME_OWNER_encoded <- predict(dummies, df) %>% as.data.frame()
df <- bind_cols(
df%>% select(-HOME_OWNER), HOME_OWNER_encoded
)
# verify the changes
head(df)
The DONOR_GENDER has ‘UKNOWN’ which is its own category, hence three categories with no order
The HOME_OWNER is also nominal categorical as it has three categories of no specific order. H = yes, U = unknown, N = no
# check the MEDIAN_HOME_VALUE
table(df$MEDIAN_HOME_VALUE[df$MEDIAN_HOME_VALUE == 0])
##
## 0
## 218
# rows with MEDIAN_HOME_VALUE of zero
zero_MEDIAN_HOME_VALUE <- which(df$MEDIAN_HOME_VALUE == 0)
print(df[zero_MEDIAN_HOME_VALUE, c(
"DONOR_AGE",
"INCOME_LEVEL",
"SES",
"MEDIAN_HOME_VALUE",
"MEDIAN_HOUSEHOLD_INCOME",
"DONATION_RESPONSE",
"MONTHS_SINCE_LAST_GIFT",
"LIFETIME_GIFT_COUNT",
"LIFETIME_GIFT_AMOUNT",
"LIFETIME_MAX_GIFT_AMT",
"LIFETIME_MIN_GIFT_AMT",
"LIFETIME_AVG_GIFT_AMT",
"EMAILS_12"
)])
## DONOR_AGE INCOME_LEVEL SES MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME
## 139 60 4 11 0 0
## 352 60 7 12 0 0
## 363 63 2 21 0 0
## 459 60 4 10 0 413000
## 674 75 7 11 0 237000
## 862 81 1 4 0 55000
## 1122 60 4 10 0 0
## 1123 60 4 43 0 0
## 1177 41 3 830 0 0
## 1243 60 4 31 0 0
## 1496 86 5 23 0 364000
## 1550 60 4 830 0 0
## 1551 60 4 830 0 0
## 1552 60 4 42 0 326000
## 1799 49 4 31 0 0
## 2008 73 1 32 0 0
## 2208 53 3 36 0 0
## 2223 76 2 36 0 0
## 2230 61 5 40 0 0
## 2357 60 4 39 0 0
## 2420 60 4 16 0 0
## 2661 60 4 30 0 0
## 2713 82 4 40 0 0
## 2810 37 7 35 0 0
## 2811 60 4 49 0 0
## 2825 51 5 35 0 0
## 2925 60 4 31 0 132000
## 2951 43 5 17 0 0
## 2994 60 4 42 0 0
## 3176 60 4 30 0 0
## 3233 51 3 51 0 0
## 3286 60 4 21 0 0
## 3400 60 4 14 0 0
## 3654 47 6 18 0 0
## 3827 60 4 1 0 58000
## 3837 60 1 10 0 50000
## 3877 60 4 31 0 0
## 3890 60 4 19 0 0
## 4235 78 2 830 0 0
## 4261 75 4 10 0 0
## 4271 60 1 26 0 344000
## 4410 60 4 10 0 0
## 4411 60 4 10 0 0
## 4412 60 4 10 0 0
## 4413 60 4 10 0 0
## 4414 60 4 10 0 0
## 4449 60 2 16 0 0
## 4556 27 2 9 0 0
## 4568 37 2 8 0 0
## 4569 60 4 7 0 0
## 4604 60 4 33 0 0
## 4605 60 4 21 0 0
## 4615 74 2 36 0 0
## 4766 60 1 31 0 0
## 4999 41 4 39 0 0
## 5126 51 4 36 0 0
## 5290 48 4 40 0 0
## 5484 60 4 41 0 0
## 5722 41 3 42 0 0
## 5872 34 4 49 0 0
## 5875 60 4 41 0 0
## 6538 80 4 23 0 429000
## 7364 39 3 10 0 0
## 7570 61 4 36 0 0
## 7802 51 7 24 0 0
## 8344 55 1 9 0 181000
## 8345 86 2 9 0 181000
## 8346 60 1 1 0 0
## 8348 37 1 10 0 118000
## 9278 38 4 10 0 54000
## 9280 55 4 1 0 0
## 9281 60 4 1 0 97000
## 9316 60 4 1 0 369000
## 9320 60 4 1 0 369000
## 9321 45 1 1 0 547000
## 9323 80 5 1 0 315000
## 9499 75 5 1 0 0
## 10210 60 4 10 0 0
## 10224 79 2 31 0 0
## 10259 45 2 34 0 0
## 10403 55 6 31 0 0
## 10575 57 5 13 0 0
## 10720 60 4 31 0 0
## 10733 53 4 18 0 0
## 10834 42 1 9 0 0
## 10857 60 3 12 0 0
## 10951 61 4 39 0 0
## 10994 60 4 32 0 0
## 11022 60 4 31 0 0
## 11023 57 4 31 0 0
## 11085 82 4 36 0 0
## 11153 60 4 25 0 0
## 11245 60 4 24 0 0
## 11258 60 4 15 0 0
## 11328 60 4 12 0 0
## 11338 37 1 30 0 0
## 11386 75 1 41 0 0
## 11448 60 4 34 0 0
## 11522 70 4 41 0 0
## 11610 35 5 24 0 291000
## 11618 37 4 21 0 272000
## 11739 49 5 16 0 0
## 11748 78 4 21 0 0
## 11758 60 1 21 0 0
## 11759 80 2 21 0 0
## 12016 55 2 41 0 0
## 12028 60 4 39 0 0
## 12054 60 4 31 0 0
## 12074 35 3 30 0 0
## 12081 78 3 10 0 0
## 12245 35 7 10 0 0
## 12247 60 4 22 0 0
## 12248 60 4 10 0 0
## 12295 21 6 35 0 0
## 12322 49 6 1 0 0
## 12386 42 4 38 0 0
## 12554 51 4 39 0 0
## 12568 74 2 9 0 202000
## 12689 57 3 31 0 0
## 12707 60 4 31 0 0
## 12848 59 2 31 0 0
## 12873 60 4 15 0 0
## 12891 37 6 18 0 0
## 12949 60 4 21 0 0
## 12986 60 4 1 0 382000
## 13028 69 1 4 0 0
## 13047 60 3 25 0 72000
## 13048 64 2 9 0 201000
## 13054 76 1 10 0 200000
## 13119 49 7 40 0 0
## 13259 60 1 31 0 0
## 13264 60 3 24 0 0
## 13292 29 3 41 0 0
## 13368 49 5 26 0 0
## 13401 60 4 51 0 55000
## 13417 40 5 39 0 0
## 13462 75 1 51 0 0
## 13713 27 1 29 0 0
## 13731 60 1 30 0 0
## 13734 85 1 30 0 0
## 13970 60 4 41 0 0
## 14011 75 4 24 0 0
## 14022 60 4 41 0 0
## 14041 45 3 44 0 0
## 14053 63 5 46 0 0
## 14165 55 4 8 0 0
## 14167 39 4 21 0 0
## 14239 60 4 10 0 113000
## 14399 60 4 10 0 0
## 14505 60 4 4 0 303000
## 14519 75 4 1 0 0
## 14557 60 4 6 0 258000
## 14578 70 4 33 0 178000
## 14921 60 4 34 0 0
## 15269 70 2 33 0 169000
## 15579 44 4 30 0 338000
## 15675 44 4 4 0 0
## 15773 52 4 830 0 0
## 15827 79 7 35 0 0
## 15831 60 4 1 0 333000
## 15843 39 4 17 0 314000
## 16025 61 5 20 0 0
## 16338 85 1 10 0 132000
## 16339 60 4 10 0 132000
## 16342 60 4 10 0 0
## 16345 43 1 9 0 192000
## 16418 51 4 15 0 0
## 16501 50 5 24 0 0
## 16676 60 4 10 0 58000
## 16691 47 4 10 0 112000
## 16698 60 4 25 0 74000
## 16772 60 4 9 0 263000
## 16842 60 4 34 0 0
## 16981 63 5 10 0 0
## 16982 60 4 21 0 0
## 17303 60 1 25 0 207000
## 17335 60 4 830 0 0
## 17459 60 4 10 0 0
## 17460 60 4 10 0 0
## 17461 60 4 10 0 0
## 17462 60 4 10 0 0
## 17463 60 4 10 0 0
## 17464 60 4 10 0 0
## 17465 60 4 10 0 0
## 17466 60 4 10 0 0
## 17467 60 4 10 0 0
## 17468 60 4 830 0 0
## 17469 60 4 830 0 0
## 17520 60 4 41 0 293000
## 17539 60 4 830 0 0
## 17540 60 4 830 0 0
## 17596 61 5 31 0 0
## 17666 60 4 16 0 65000
## 17710 55 3 8 0 0
## 17711 41 5 40 0 0
## 17721 60 5 25 0 0
## 17876 43 5 37 0 0
## 17880 49 3 27 0 0
## 17908 41 7 42 0 0
## 17953 60 4 15 0 0
## 17968 60 4 21 0 0
## 17974 25 1 18 0 0
## 18025 60 5 40 0 0
## 18068 60 4 10 0 102000
## 18087 35 4 10 0 0
## 18290 49 3 27 0 0
## 18337 60 1 10 0 0
## 18375 60 4 39 0 210000
## 18382 34 4 36 0 0
## 18491 60 2 30 0 0
## 18495 60 4 40 0 0
## 18894 60 4 10 0 0
## 18913 60 4 21 0 0
## 19040 60 4 10 0 0
## 19159 60 4 830 0 0
## 19306 60 2 51 0 0
## 19339 56 4 830 0 0
## 19344 79 5 830 0 0
## DONATION_RESPONSE MONTHS_SINCE_LAST_GIFT LIFETIME_GIFT_COUNT
## 139 0 18 6
## 352 6 4 13
## 363 0 15 9
## 459 0 18 6
## 674 0 16 15
## 862 3 18 8
## 1122 0 17 53
## 1123 0 18 12
## 1177 0 22 30
## 1243 0 21 8
## 1496 0 15 11
## 1550 0 15 3
## 1551 0 21 4
## 1552 0 18 3
## 1799 0 23 1
## 2008 0 16 7
## 2208 0 18 16
## 2223 42 17 13
## 2230 0 21 13
## 2357 0 15 11
## 2420 0 20 1
## 2661 0 17 7
## 2713 0 15 14
## 2810 7 23 5
## 2811 0 17 4
## 2825 0 21 11
## 2925 0 16 2
## 2951 0 17 1
## 2994 0 25 21
## 3176 0 26 37
## 3233 0 23 2
## 3286 0 15 12
## 3400 0 19 2
## 3654 0 18 4
## 3827 0 18 19
## 3837 0 18 5
## 3877 0 17 24
## 3890 0 15 7
## 4235 0 15 5
## 4261 0 16 25
## 4271 0 22 9
## 4410 0 24 13
## 4411 0 18 23
## 4412 0 24 11
## 4413 0 24 4
## 4414 0 18 4
## 4449 9 21 1
## 4556 0 25 16
## 4568 1 18 24
## 4569 0 15 5
## 4604 0 18 19
## 4605 0 16 2
## 4615 0 17 21
## 4766 0 18 4
## 4999 0 18 8
## 5126 0 17 9
## 5290 0 18 17
## 5484 0 16 6
## 5722 0 26 18
## 5872 0 24 2
## 5875 0 16 6
## 6538 0 20 1
## 7364 0 25 18
## 7570 0 16 12
## 7802 0 21 10
## 8344 0 18 8
## 8345 0 15 6
## 8346 0 18 5
## 8348 1 17 1
## 9278 0 18 7
## 9280 0 17 6
## 9281 0 20 1
## 9316 0 18 5
## 9320 0 20 21
## 9321 0 17 12
## 9323 15 19 16
## 9499 0 20 3
## 10210 0 21 2
## 10224 0 16 10
## 10259 0 23 12
## 10403 0 20 1
## 10575 6 17 1
## 10720 0 17 9
## 10733 0 17 10
## 10834 0 16 1
## 10857 0 19 2
## 10951 0 19 6
## 10994 0 15 4
## 11022 0 21 8
## 11023 0 20 6
## 11085 0 20 10
## 11153 0 20 2
## 11245 0 19 1
## 11258 0 15 5
## 11328 0 17 1
## 11338 0 26 4
## 11386 0 17 34
## 11448 0 18 4
## 11522 0 18 21
## 11610 5 23 2
## 11618 0 20 1
## 11739 1 16 1
## 11748 0 26 21
## 11758 0 15 10
## 11759 0 16 16
## 12016 0 18 6
## 12028 0 21 1
## 12054 0 18 10
## 12074 0 15 13
## 12081 12 21 2
## 12245 0 16 6
## 12247 0 17 27
## 12248 0 21 5
## 12295 7 24 16
## 12322 0 15 24
## 12386 0 18 22
## 12554 0 17 1
## 12568 0 16 17
## 12689 0 16 12
## 12707 0 22 10
## 12848 0 19 5
## 12873 0 24 14
## 12891 6 16 15
## 12949 0 20 1
## 12986 0 20 3
## 13028 0 18 12
## 13047 17 20 3
## 13048 0 19 16
## 13054 12 18 15
## 13119 0 25 11
## 13259 0 15 4
## 13264 0 24 34
## 13292 0 18 4
## 13368 0 20 1
## 13401 0 17 18
## 13417 2 15 11
## 13462 0 21 2
## 13713 0 24 14
## 13731 2 18 21
## 13734 18 16 6
## 13970 0 22 2
## 14011 0 18 5
## 14022 0 15 18
## 14041 0 18 2
## 14053 0 16 22
## 14165 0 26 17
## 14167 0 24 8
## 14239 0 16 1
## 14399 0 23 2
## 14505 0 15 15
## 14519 0 23 1
## 14557 0 22 1
## 14578 7 19 3
## 14921 0 24 5
## 15269 5 15 7
## 15579 1 18 6
## 15675 0 16 3
## 15773 0 17 4
## 15827 0 18 3
## 15831 0 21 1
## 15843 0 20 4
## 16025 0 16 8
## 16338 1 18 22
## 16339 0 16 2
## 16342 0 18 12
## 16345 4 21 2
## 16418 3 15 15
## 16501 17 18 2
## 16676 0 15 3
## 16691 0 18 16
## 16698 0 25 2
## 16772 0 21 1
## 16842 0 19 6
## 16981 0 18 28
## 16982 0 18 7
## 17303 0 18 16
## 17335 0 16 20
## 17459 0 18 6
## 17460 0 22 8
## 17461 0 15 20
## 17462 0 15 4
## 17463 0 22 8
## 17464 0 17 4
## 17465 0 17 5
## 17466 0 18 7
## 17467 0 23 3
## 17468 0 16 36
## 17469 0 20 3
## 17520 0 16 1
## 17539 0 16 9
## 17540 0 17 14
## 17596 0 20 16
## 17666 0 22 8
## 17710 0 18 6
## 17711 0 16 3
## 17721 1 18 16
## 17876 0 19 1
## 17880 1 20 5
## 17908 0 19 4
## 17953 0 22 16
## 17968 0 21 1
## 17974 0 17 5
## 18025 0 17 1
## 18068 0 18 5
## 18087 0 19 16
## 18290 0 24 3
## 18337 0 20 5
## 18375 0 15 12
## 18382 0 17 12
## 18491 4 24 2
## 18495 0 16 7
## 18894 0 16 3
## 18913 0 20 2
## 19040 0 5 23
## 19159 0 16 11
## 19306 0 21 1
## 19339 0 18 10
## 19344 0 21 2
## LIFETIME_GIFT_AMOUNT LIFETIME_MAX_GIFT_AMT LIFETIME_MIN_GIFT_AMT
## 139 15500 3000 2000
## 352 21300 2500 1000
## 363 6900 1000 500
## 459 33700 10000 2500
## 674 20400 5000 500
## 862 11859 2500 359
## 1122 49800 1500 500
## 1123 9500 1500 500
## 1177 31300 2500 300
## 1243 13600 3000 500
## 1496 18000 2500 500
## 1550 2200 1100 500
## 1551 3700 1700 500
## 1552 6000 2000 2000
## 1799 2500 2500 2500
## 2008 8600 1800 500
## 2208 37000 5000 500
## 2223 6800 1000 100
## 2230 12600 1300 500
## 2357 6600 2000 200
## 2420 2000 2000 2000
## 2661 4500 1000 400
## 2713 17400 2100 300
## 2810 7900 2500 300
## 2811 6100 2500 600
## 2825 17200 5000 300
## 2925 2000 1500 500
## 2951 2000 2000 2000
## 2994 13200 1000 300
## 3176 34900 1500 200
## 3233 6500 4000 2500
## 3286 7600 1000 300
## 3400 1900 1500 400
## 3654 8000 2500 1500
## 3827 10900 1000 500
## 3837 3000 1000 200
## 3877 25800 2200 200
## 3890 23100 5000 2500
## 4235 3700 1000 500
## 4261 11500 900 300
## 4271 10000 2000 200
## 4410 19400 2600 500
## 4411 12000 800 200
## 4412 27500 6000 500
## 4413 3800 1500 500
## 4414 4700 1500 1000
## 4449 1500 1500 1500
## 4556 10800 1000 300
## 4568 8500 1000 200
## 4569 5300 1500 500
## 4604 14100 1500 300
## 4605 2500 1500 1000
## 4615 25900 2000 700
## 4766 5900 2000 500
## 4999 7600 1500 500
## 5126 10000 1500 500
## 5290 11800 1500 200
## 5484 5500 2000 500
## 5722 15100 1200 300
## 5872 4000 2000 2000
## 5875 3200 700 300
## 6538 3000 3000 3000
## 7364 15800 1300 500
## 7570 5000 700 200
## 7802 9700 1500 500
## 8344 4500 1000 500
## 8345 8000 1500 1000
## 8346 5400 1400 600
## 8348 2000 2000 2000
## 9278 13500 3000 1000
## 9280 6800 2000 500
## 9281 2500 2500 2500
## 9316 6100 1300 1000
## 9320 19800 1900 300
## 9321 13900 2000 500
## 9323 8700 1100 200
## 9499 3200 1500 500
## 10210 2500 1500 1000
## 10224 4800 500 300
## 10259 7200 1000 500
## 10403 1500 1500 1500
## 10575 1500 1500 1500
## 10720 15000 3000 1000
## 10733 38000 10000 500
## 10834 2500 2500 2500
## 10857 3700 2200 1500
## 10951 3400 1000 400
## 10994 3300 1000 500
## 11022 7577 1577 300
## 11023 7200 2000 300
## 11085 12400 1800 500
## 11153 3500 2000 1500
## 11245 2500 2500 2500
## 11258 3200 1000 500
## 11328 1500 1500 1500
## 11338 9400 2700 2000
## 11386 11450 1500 150
## 11448 6200 2200 1000
## 11522 20600 1300 300
## 11610 2600 1600 1000
## 11618 5000 5000 5000
## 11739 1500 1500 1500
## 11748 21700 1500 300
## 11758 10400 1500 500
## 11759 16300 1700 200
## 12016 6600 2000 500
## 12028 2500 2500 2500
## 12054 19400 2500 500
## 12074 11600 1500 500
## 12081 4000 2500 1500
## 12245 20300 5200 2100
## 12247 20200 1100 300
## 12248 6100 2000 500
## 12295 14300 2000 300
## 12322 17100 1500 300
## 12386 7200 500 200
## 12554 1500 1500 1500
## 12568 15350 1500 250
## 12689 12600 1500 500
## 12707 10300 1700 500
## 12848 6300 2000 500
## 12873 23500 2500 300
## 12891 12400 1100 500
## 12949 2000 2000 2000
## 12986 4000 2500 500
## 13028 17900 2500 100
## 13047 5500 3000 500
## 13048 67300 5100 2500
## 13054 14600 1500 500
## 13119 20000 2500 1000
## 13259 4100 1500 500
## 13264 18500 1200 300
## 13292 8000 3000 1000
## 13368 2000 2000 2000
## 13401 8200 600 300
## 13417 12600 2000 500
## 13462 3200 1700 1500
## 13713 15700 2600 300
## 13731 7000 500 100
## 13734 10500 2500 1000
## 13970 3000 2000 1000
## 14011 3600 1000 500
## 14022 39100 5000 600
## 14041 5000 3000 2000
## 14053 27900 2500 300
## 14165 16400 2500 500
## 14167 12500 2000 1000
## 14239 2000 2000 2000
## 14399 1800 1500 300
## 14505 8200 800 300
## 14519 2500 2500 2500
## 14557 1500 1500 1500
## 14578 5700 2100 1500
## 14921 6600 2000 600
## 15269 29300 8000 500
## 15579 8500 2000 1000
## 15675 4500 2500 500
## 15773 9000 2500 2000
## 15827 5000 2500 1000
## 15831 2000 2000 2000
## 15843 4700 1500 1000
## 16025 4200 1200 300
## 16338 9900 600 300
## 16339 2600 1600 1000
## 16342 10900 1500 300
## 16345 2500 1500 1000
## 16418 11200 1500 500
## 16501 2500 2000 500
## 16676 2000 1000 500
## 16691 20900 2500 500
## 16698 4500 2500 2000
## 16772 2000 2000 2000
## 16842 5300 1500 300
## 16981 16100 1000 300
## 16982 7800 1700 200
## 17303 21800 3000 300
## 17335 20500 1500 300
## 17459 12300 5000 600
## 17460 9400 1500 500
## 17461 9800 1200 300
## 17462 4400 1500 500
## 17463 10500 2700 300
## 17464 6000 2000 1000
## 17465 8200 2100 1000
## 17466 8200 3000 500
## 17467 6000 2500 1500
## 17468 52200 3500 300
## 17469 3300 1700 500
## 17520 1500 1500 1500
## 17539 6700 2000 300
## 17540 6100 600 300
## 17596 29900 2700 1000
## 17666 20700 3500 500
## 17710 5400 1200 500
## 17711 4700 1700 1500
## 17721 13300 1800 300
## 17876 1500 1500 1500
## 17880 5400 1600 500
## 17908 3800 1500 500
## 17953 8600 700 300
## 17968 2000 2000 2000
## 17974 4400 1000 500
## 18025 2500 2500 2500
## 18068 4900 1500 200
## 18087 14900 1500 500
## 18290 5000 2000 1000
## 18337 7500 2500 500
## 18375 5100 600 300
## 18382 7500 1000 200
## 18491 3000 2000 1000
## 18495 12000 2500 500
## 18894 3200 1200 1000
## 18913 3000 2000 1000
## 19040 46600 5000 300
## 19159 6300 700 500
## 19306 2000 2000 2000
## 19339 14500 2500 1000
## 19344 3000 2500 500
## LIFETIME_AVG_GIFT_AMT EMAILS_12
## 139 2583 37
## 352 1638 37
## 363 767 36
## 459 5617 24
## 674 1360 22
## 862 1482 22
## 1122 940 29
## 1123 792 15
## 1177 1043 24
## 1243 1700 33
## 1496 1636 21
## 1550 733 17
## 1551 925 11
## 1552 2000 9
## 1799 2500 13
## 2008 1229 21
## 2208 2313 19
## 2223 523 20
## 2230 969 19
## 2357 600 21
## 2420 2000 15
## 2661 643 19
## 2713 1243 20
## 2810 1580 19
## 2811 1525 19
## 2825 1564 19
## 2925 1000 15
## 2951 2000 14
## 2994 629 16
## 3176 943 10
## 3233 3250 19
## 3286 633 20
## 3400 950 17
## 3654 2000 19
## 3827 574 18
## 3837 600 18
## 3877 1075 20
## 3890 3300 19
## 4235 740 20
## 4261 460 20
## 4271 1111 18
## 4410 1492 18
## 4411 522 16
## 4412 2500 19
## 4413 950 16
## 4414 1175 16
## 4449 1500 11
## 4556 675 16
## 4568 354 19
## 4569 1060 16
## 4604 742 19
## 4605 1250 15
## 4615 1233 14
## 4766 1475 19
## 4999 950 18
## 5126 1111 15
## 5290 694 19
## 5484 917 20
## 5722 839 13
## 5872 2000 13
## 5875 533 19
## 6538 3000 16
## 7364 878 17
## 7570 417 12
## 7802 970 17
## 8344 563 18
## 8345 1333 13
## 8346 1080 20
## 8348 2000 14
## 9278 1929 19
## 9280 1133 19
## 9281 2500 16
## 9316 1220 18
## 9320 943 19
## 9321 1158 19
## 9323 544 15
## 9499 1067 20
## 10210 1250 14
## 10224 480 20
## 10259 600 16
## 10403 1500 11
## 10575 1500 14
## 10720 1667 19
## 10733 3800 21
## 10834 2500 17
## 10857 1850 16
## 10951 567 19
## 10994 825 15
## 11022 947 19
## 11023 1200 15
## 11085 1240 19
## 11153 1750 16
## 11245 2500 14
## 11258 640 18
## 11328 1500 14
## 11338 2350 17
## 11386 337 11
## 11448 1550 19
## 11522 981 17
## 11610 1300 14
## 11618 5000 16
## 11739 1500 14
## 11748 1033 15
## 11758 1040 19
## 11759 1019 19
## 12016 1100 19
## 12028 2500 17
## 12054 1940 19
## 12074 892 17
## 12081 2000 19
## 12245 3383 21
## 12247 748 17
## 12248 1220 18
## 12295 894 18
## 12322 713 19
## 12386 327 19
## 12554 1500 15
## 12568 903 19
## 12689 1050 18
## 12707 1030 16
## 12848 1260 19
## 12873 1679 17
## 12891 827 21
## 12949 2000 14
## 12986 1333 20
## 13028 1492 19
## 13047 1833 20
## 13048 4206 20
## 13054 973 19
## 13119 1818 19
## 13259 1025 19
## 13264 544 19
## 13292 2000 19
## 13368 2000 14
## 13401 456 20
## 13417 1145 21
## 13462 1600 14
## 13713 1121 18
## 13731 333 18
## 13734 1750 19
## 13970 1500 14
## 14011 720 20
## 14022 2172 20
## 14041 2500 19
## 14053 1268 17
## 14165 965 15
## 14167 1563 17
## 14239 2000 16
## 14399 900 14
## 14505 547 13
## 14519 2500 14
## 14557 1500 11
## 14578 1900 18
## 14921 1320 18
## 15269 4186 23
## 15579 1417 19
## 15675 1500 20
## 15773 2250 21
## 15827 1667 18
## 15831 2000 14
## 15843 1175 14
## 16025 525 19
## 16338 450 19
## 16339 1300 17
## 16342 908 19
## 16345 1250 19
## 16418 747 15
## 16501 1250 16
## 16676 667 17
## 16691 1306 19
## 16698 2250 18
## 16772 2000 14
## 16842 883 19
## 16981 575 18
## 16982 1114 17
## 17303 1363 19
## 17335 1025 20
## 17459 2050 20
## 17460 1175 18
## 17461 490 19
## 17462 1100 18
## 17463 1313 19
## 17464 1500 18
## 17465 1640 19
## 17466 1171 19
## 17467 2000 20
## 17468 1450 20
## 17469 1100 16
## 17520 1500 14
## 17539 744 17
## 17540 436 20
## 17596 1869 19
## 17666 2588 19
## 17710 900 19
## 17711 1567 20
## 17721 831 12
## 17876 1500 14
## 17880 1080 21
## 17908 950 16
## 17953 538 18
## 17968 2000 16
## 17974 880 20
## 18025 2500 16
## 18068 980 14
## 18087 931 19
## 18290 1667 17
## 18337 1500 19
## 18375 425 20
## 18382 625 22
## 18491 1500 14
## 18495 1714 13
## 18894 1067 6
## 18913 1500 7
## 19040 2026 44
## 19159 573 6
## 19306 2000 16
## 19339 1450 19
## 19344 1500 19
new_df <- df
There is 0 entries for median home value. This is likely a data entry error because home value cannot be zero
ACTION: Observe for now, deletion may affect the model.
# confirm all changes
table(new_df$MEDIAN_HOME_VALUE[new_df$MEDIAN_HOME_VALUE == 0])
##
## 0
## 218
str(new_df)
## 'data.frame': 19372 obs. of 27 variables:
## $ CONTROL_NUMBER : int 5 12 37 38 41 52 53 67 70 71 ...
## $ DONATED : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
## $ DONATION_AMT : num 0 1000 0 0 0 0 0 500 0 1600 ...
## $ DONOR_AGE : int 87 79 75 60 74 63 71 79 41 63 ...
## $ INCOME_LEVEL : num 2 7 5 6 2 3 5 1 4 4 ...
## $ SES : num 830 45 11 4 49 8 50 28 30 43 ...
## $ MEDIAN_HOME_VALUE : num 554000 334000 2388000 1688000 514000 ...
## $ MEDIAN_HOUSEHOLD_INCOME: num 294000 212000 405000 153000 328000 182000 122000 189000 180000 307000 ...
## $ DONATION_RESPONSE : int 0 0 0 0 8 0 0 0 0 0 ...
## $ MONTHS_SINCE_LAST_GIFT : int 26 7 6 6 18 19 21 9 17 9 ...
## $ EMAILS_12 : int 17 28 43 44 25 29 25 33 36 38 ...
## $ LIFETIME_GIFT_COUNT : int 35 25 36 37 12 22 20 20 34 22 ...
## $ LIFETIME_EMAILS : int 19 32 44 31 30 22 29 33 36 27 ...
## $ LIFETIME_GIFT_AMOUNT : num 29700 36800 60300 43500 10600 12800 22000 10100 17100 15000 ...
## $ LIFETIME_MAX_GIFT_AMT : num 2000 2500 2800 1700 2500 1000 2000 700 700 1100 ...
## $ LIFETIME_MIN_GIFT_AMT : int 500 500 500 300 500 300 500 300 300 300 ...
## $ LIFETIME_AVG_GIFT_AMT : int 849 1472 1675 1176 883 582 1100 505 503 682 ...
## $ URBANICITYR : num 0 1 0 0 1 0 1 0 0 1 ...
## $ URBANICITYS : num 1 0 1 0 0 0 0 1 1 0 ...
## $ URBANICITYT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ URBANICITYU : num 0 0 0 1 0 1 0 0 0 0 ...
## $ DONOR_GENDERF : num 0 0 1 1 1 0 0 1 1 1 ...
## $ DONOR_GENDERM : num 1 1 0 0 0 1 1 0 0 0 ...
## $ DONOR_GENDERU : num 0 0 0 0 0 0 0 0 0 0 ...
## $ HOME_OWNERH : num 1 1 1 1 0 0 1 1 1 1 ...
## $ HOME_OWNERN : num 0 0 0 0 0 0 0 0 0 0 ...
## $ HOME_OWNERU : num 0 0 0 0 1 1 0 0 0 0 ...
# Convert target variable to factor
new_df$DONATED <- as.factor(new_df$DONATED)
# reference category
new_df$DONOR_GENDERU <- NULL
new_df$HOME_OWNERU <- NULL
new_df$URBANICITYU <- NULL
# distribution of target variable
ggplot(new_df, aes(x = DONATED)) +
geom_bar(fill = "steelblue") +
labs(title="Distribution of donors who donated") +
theme(axis.text = element_text(angle = 45, vjust = 0.5, hjust = 1))
#### Key findings
A high percentage of donors did not make a donation to the latest campaign. This may affect the model prediction.
# create a correlation plot for the numerical variables
numeric_cols <- new_df %>%
select(c("DONOR_AGE",
"INCOME_LEVEL",
"SES",
"MEDIAN_HOME_VALUE",
"MEDIAN_HOUSEHOLD_INCOME",
"DONATION_RESPONSE",
"MONTHS_SINCE_LAST_GIFT",
"LIFETIME_GIFT_COUNT",
"LIFETIME_GIFT_AMOUNT",
"LIFETIME_MAX_GIFT_AMT",
"LIFETIME_MIN_GIFT_AMT",
"LIFETIME_AVG_GIFT_AMT",
"EMAILS_12"
))
par(mar = c(2,2,2,2)) # expand margins because the plot is small
# visualize the corr plot
corrplot(cor(numeric_cols),
method = 'color',
type = 'upper',
addCoef.col = 'black',
tl.srt = 45,
tl.col = "black",
number.cex = 0.55
)
#### Key findings high correlation between MEDIAN_HOME_VALUE and
MEDIAN_HOUSEHOLD_INCOME - 0.68 is expected as a higher income is likely
to live in a high median home value.
high correlation between LIFETIME_GIFT_COUNT and LIFETIME_GIFT_AMOUNT - 0.65 - the more the gifts the more the counts
high correlation between LIFETIME_MAX_GIFT_AMT and LIFETIME_MIN_GIFT_AMT - 0.73
high correlation between LIFETIME_AVG_GIFT_AMT and LIFETIME_MIN_GIFT_AMT - 0.80 - this is very high, could they be measuring the same thing?
# investigate the relation between the target variable and numerical variables
# for the firs 6 numerical variables
# reset the margins
par(mar = c(4,4,2,1)) # the grid is made of 2 rows and 4 columns
par(mfrow = c(2,4)) # multiple figures filled row by row in the set 2 rows bt 4 columns.
numeric_cols <- new_df %>%
select(c("DONOR_AGE",
"INCOME_LEVEL",
"SES",
"MEDIAN_HOME_VALUE",
"MEDIAN_HOUSEHOLD_INCOME",
"DONATION_RESPONSE"
))
for (variable in names(numeric_cols)) {
boxplot(new_df[[variable]] ~ new_df$DONATED,
main = variable,
xlab = "Donated",
ylab = variable,
col = c("orange", "steelblue"),
outline = FALSE) # FALSE to not show the outliers
}
# investigate the relation between the target variable and numerical variables
# for the second 6 numerical variables
# reset the margins
par(mar = c(4,4,2,1))
par(mfrow = c(2,4))
numeric_cols <- df %>%
select(c("MONTHS_SINCE_LAST_GIFT",
"LIFETIME_GIFT_COUNT",
"LIFETIME_GIFT_AMOUNT",
"LIFETIME_MAX_GIFT_AMT",
"LIFETIME_MIN_GIFT_AMT",
"LIFETIME_AVG_GIFT_AMT",
"EMAILS_12"
))
for (variable in names(numeric_cols)) {
boxplot(new_df[[variable]] ~ new_df$DONATED,
main = variable,
xlab = "Donated",
ylab = variable,
col = c("orange", "steelblue"),
outline = FALSE) # FALSE to not show the outliers
}
#### Key findings
DONOR_AGE, INCOME_LEVEL, SES are a weak predictor as it does not separate those who donated from those who did not donate.
MEDIAN_HOME_VALUE, MEDIAN_HOUSEHOLD_INCOME and DONATION_RESPONSE have the blue box slightly higher than the orange which means that those who donated are likely to have a higher median value home,
higher median household income and are likely more responsive. These are strong predictors for the model.
Strong predictors:
MONTHS_SINCE_LAST_GIFT shows that who didnt donate have more months since their last gift.
LIFETIME_MAX_GIFT_AMT,LIFETIME_AVG_GIFT_AMT shows a difference with those who failed to donate as higher than those who didn’t.
LIFETIME_GIFT_AMOUNT shows those who have given more gifts were slightly higher than those who didn’t.
EMAILS_12 shows that those who receieved more campaignm emails were more likley to give compared to those who didnt.
Weak predictors:
LIFETIME_MIN_GIFT_AMT shows no difference between donors and non-donors
new_df$DONATION_AMT <- NULL
new_df$CONTROL_NUMBER <- NULL
# last check before splitting the data
str(new_df)
## 'data.frame': 19372 obs. of 22 variables:
## $ DONATED : Factor w/ 2 levels "FALSE","TRUE": 1 2 1 1 1 1 1 2 1 2 ...
## $ DONOR_AGE : int 87 79 75 60 74 63 71 79 41 63 ...
## $ INCOME_LEVEL : num 2 7 5 6 2 3 5 1 4 4 ...
## $ SES : num 830 45 11 4 49 8 50 28 30 43 ...
## $ MEDIAN_HOME_VALUE : num 554000 334000 2388000 1688000 514000 ...
## $ MEDIAN_HOUSEHOLD_INCOME: num 294000 212000 405000 153000 328000 182000 122000 189000 180000 307000 ...
## $ DONATION_RESPONSE : int 0 0 0 0 8 0 0 0 0 0 ...
## $ MONTHS_SINCE_LAST_GIFT : int 26 7 6 6 18 19 21 9 17 9 ...
## $ EMAILS_12 : int 17 28 43 44 25 29 25 33 36 38 ...
## $ LIFETIME_GIFT_COUNT : int 35 25 36 37 12 22 20 20 34 22 ...
## $ LIFETIME_EMAILS : int 19 32 44 31 30 22 29 33 36 27 ...
## $ LIFETIME_GIFT_AMOUNT : num 29700 36800 60300 43500 10600 12800 22000 10100 17100 15000 ...
## $ LIFETIME_MAX_GIFT_AMT : num 2000 2500 2800 1700 2500 1000 2000 700 700 1100 ...
## $ LIFETIME_MIN_GIFT_AMT : int 500 500 500 300 500 300 500 300 300 300 ...
## $ LIFETIME_AVG_GIFT_AMT : int 849 1472 1675 1176 883 582 1100 505 503 682 ...
## $ URBANICITYR : num 0 1 0 0 1 0 1 0 0 1 ...
## $ URBANICITYS : num 1 0 1 0 0 0 0 1 1 0 ...
## $ URBANICITYT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ DONOR_GENDERF : num 0 0 1 1 1 0 0 1 1 1 ...
## $ DONOR_GENDERM : num 1 1 0 0 0 1 1 0 0 0 ...
## $ HOME_OWNERH : num 1 1 1 1 0 0 1 1 1 1 ...
## $ HOME_OWNERN : num 0 0 0 0 0 0 0 0 0 0 ...
set.seed(123)
# create an index
train_index <- createDataPartition(new_df$DONATED, p = 0.8, list = FALSE)
# Split index into train and test
train_data <- new_df[train_index, ]
test_data <- new_df[-train_index, ]
# check the total observations
total_observations <- sum(nrow(train_data) + nrow(test_data))
# verify
nrow(train_data)
## [1] 15499
nrow(test_data)
## [1] 3873
total_observations
## [1] 19372
prop.table(table(train_data$DONATED))
##
## FALSE TRUE
## 0.7499839 0.2500161
prop.table(table(test_data$DONATED))
##
## FALSE TRUE
## 0.7500645 0.2499355
model <- glm(DONATED~.,family = "binomial", data = train_data)
summary(model)
##
## Call:
## glm(formula = DONATED ~ ., family = "binomial", data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.417e-01 2.064e-01 -3.593 0.000326 ***
## DONOR_AGE 2.360e-03 1.460e-03 1.616 0.106076
## INCOME_LEVEL 5.372e-02 1.424e-02 3.772 0.000162 ***
## SES 2.562e-04 1.459e-04 1.756 0.079074 .
## MEDIAN_HOME_VALUE 1.475e-07 2.713e-08 5.436 5.45e-08 ***
## MEDIAN_HOUSEHOLD_INCOME -8.287e-08 1.636e-07 -0.507 0.612353
## DONATION_RESPONSE 1.069e-03 1.935e-03 0.552 0.580668
## MONTHS_SINCE_LAST_GIFT -4.196e-02 5.529e-03 -7.588 3.24e-14 ***
## EMAILS_12 -5.969e-03 4.179e-03 -1.428 0.153212
## LIFETIME_GIFT_COUNT 1.661e-02 4.637e-03 3.583 0.000340 ***
## LIFETIME_EMAILS -4.447e-03 3.893e-03 -1.142 0.253301
## LIFETIME_GIFT_AMOUNT 3.352e-06 3.812e-06 0.879 0.379213
## LIFETIME_MAX_GIFT_AMT 9.168e-06 1.857e-05 0.494 0.621580
## LIFETIME_MIN_GIFT_AMT 3.272e-05 6.281e-05 0.521 0.602401
## LIFETIME_AVG_GIFT_AMT -2.649e-04 7.332e-05 -3.613 0.000303 ***
## URBANICITYR 1.400e-01 7.348e-02 1.905 0.056726 .
## URBANICITYS 1.861e-01 6.397e-02 2.909 0.003629 **
## URBANICITYT 2.048e-01 7.215e-02 2.839 0.004526 **
## DONOR_GENDERF 8.004e-03 6.782e-02 0.118 0.906050
## DONOR_GENDERM -3.556e-02 6.984e-02 -0.509 0.610638
## HOME_OWNERH 2.347e-02 4.294e-02 0.547 0.584664
## HOME_OWNERN 5.388e-02 6.862e-02 0.785 0.432306
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17432 on 15498 degrees of freedom
## Residual deviance: 17079 on 15477 degrees of freedom
## AIC: 17123
##
## Number of Fisher Scoring iterations: 4
null_deviance <- 17432
residual_deviance <- 17079
pct_diff <- (null_deviance - residual_deviance) / null_deviance * 100
pct_diff
## [1] 2.025011
Based on the model, the following features are significant:
INCOME_LEVEL - those with a higher income level are likely to make a donation
MEDIAN_HOME_VALUE - individuals with a higher median home value likely have a higher median household income and are likely to donate.
MONTHS_SINCE_LAST_GIFT - if its been long since last gift, its likely that there would be no donation.
LIFETIME_GIFT_COUNT - The more gift counts the more likely to make a donation as we saw in the correlation plot.
LIFETIME_AVG_GIFT_AMT - higher less likely to donate becasue im assuming these individuals make big contributions once in a while
URBANICITYS & URBANICITYT - more donors are likley to come from subsurban areas or small towns as compared to Large urba cities and rural areas which are high stress areas.
The null deviance : 1.7432^{4} and the residual deviance 1.7079^{4} give a difference of 2.0250115% which means that the model only learned 2.0250115%
# evaluate the performance of the model using the validation data set
prediction_prob <- predict(model, newdata = test_data, type = "response")
summary(prediction_prob) # gives min,max,mean of predicted probabilities.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000206 0.2044861 0.2418318 0.2477663 0.2807676 0.7192983
prediction_class <- ifelse(prediction_prob > 0.3, TRUE, FALSE) # my dataset is unequal with more FALSE than TRUE, hence i lowered the threshold to 0.3 from 0.5
# lets create a confusion matrix
confusion_mtx <- confusionMatrix(
as.factor(prediction_class),
as.factor(test_data$DONATED),
positive = 'TRUE'
)
print(confusion_mtx)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 2485 711
## TRUE 420 257
##
## Accuracy : 0.708
## 95% CI : (0.6934, 0.7223)
## No Information Rate : 0.7501
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1344
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.26550
## Specificity : 0.85542
## Pos Pred Value : 0.37962
## Neg Pred Value : 0.77753
## Prevalence : 0.24994
## Detection Rate : 0.06636
## Detection Prevalence : 0.17480
## Balanced Accuracy : 0.56046
##
## 'Positive' Class : TRUE
##
# extract the output
accuracy <- confusion_mtx$overall ['Accuracy']
true_pos_rate <- confusion_mtx$byClass ['Sensitivity']
true_neg_rate <- confusion_mtx$byClass ['Specificity']
mean_TRUE_pred <- round(0.2477663, 2) * 100
The model’s results are Accuracy : 71% with a TPR: 27% and a TNR: 86%.
The model captured 27% of actual donors and 86% of actual non-donors. However the data consisted of higher FALSE - non-donors than non-donors so the model didnt learn much hence a weak prediction.
The question is why the model missed 708 actual donors because the goal for the compaign is to get more donors which means more revenue.
In this model, the average of the positively or TRUE DONOR predictions is 25 which means that the model was not well trained.
Based on all the above, i conclude that the model, at a sensitivity of 27%, means that it failed to capture any new donors hence predicted FALSE for the majority of
individuals.This may likely be due to class imbalance or low predictive power in the features used in the model.
PLAN: improve the model
-GOAL: to add more TRUE samples that the model can learn from but realistically where the non-donors will always outnumber the donors.
# Train the logistic regression model on multiple (different but same size) random subsets of the data to reduce variance and bias.
num_models <- 10
subset_size <- nrow(train_data) # multiple (different but same size) random subsets of the data
models<- list()
for (i in 1:num_models) {
sample_indices <- sample(1:nrow(train_data),
size = subset_size,
replace = TRUE)
subset <- train_data[sample_indices, ]
weight_ratio <- sum(as.character(subset$DONATED) == "FALSE") / sum(as.character(subset$DONATED) == "TRUE")
# train logistic regression with class weights
models[[i]] <- glm(DONATED~.,
data = subset,
family = 'binomial',
weights = ifelse(subset$DONATED == "TRUE", weight_ratio, 1)
)
}
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
10 models were created.
# obtain all predictions from the ten models
ten_predictions <- sapply(models, function (m){
predict(m, newdata = test_data, type = 'response')
})
# get the average of all ten probabilities
ensemble_average <- rowMeans(ten_predictions)
# used a threshold of 0.3 because we want the model to capture more TRUE in a imbalanced dataset
ensemble_class <- ifelse(ensemble_average > 0.3, TRUE, FALSE)
ensemble_class <- as.factor(ensemble_class)
Expecting matrix of ten columns for each model and rows (number of test_data rows * 10 for the ten models)
# create a confusion matrix for the ensemble class
confusion_mtx_ensemble <- confusionMatrix(ensemble_class, test_data$DONATED, positive = "TRUE")
print(confusion_mtx_ensemble)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 24 11
## TRUE 2881 957
##
## Accuracy : 0.2533
## 95% CI : (0.2397, 0.2673)
## No Information Rate : 0.7501
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0016
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.988636
## Specificity : 0.008262
## Pos Pred Value : 0.249349
## Neg Pred Value : 0.685714
## Prevalence : 0.249935
## Detection Rate : 0.247095
## Detection Prevalence : 0.990963
## Balanced Accuracy : 0.498449
##
## 'Positive' Class : TRUE
##
# extract the output
accuracy <- confusion_mtx_ensemble$overall ['Accuracy']
true_pos_rate <- confusion_mtx_ensemble$byClass ['Sensitivity']
true_neg_rate <- confusion_mtx_ensemble$byClass ['Specificity']
The model’s results are Accuracy : 25% with a TPR: 99% and a TNR: 1%.
The results suggest that the model became more aggressive in capturing the TRUE cases at 958 and completely ignored the FALSE cases at 2880. Hence poor performnace.
The data is imbalanced, lets see if we can address this using the F1 score.
# lets calculate the f1 score
# 2 * ((precision * recall) / (precision + recall))
TP <- 958
FP <- 2880
FN <- 10
precision <- TP / (TP + FP)
precision
## [1] 0.2496092
recall <- TP / (TP + FN)
recall
## [1] 0.9896694
f1_score <- 2 * (
(precision * recall) / (precision + recall)
)
f1_score
## [1] 0.3986683
With a recall of 0.99% and a precision of 0.25%, we get an f1 score of 0.399 which means that the model is willing to capture TRUE’s 99% of the time but it is wrong 25% of the time.
ACTION: Try a higher threshold?
# increase the threshold to 0.5
ensemble_class_05 <- ifelse(ensemble_average > 0.5, TRUE, FALSE)
ensemble_class_05 <- as.factor(ensemble_class_05)
# confusion matrix with a threshold of 0.5
confusion_mtx_ensemble_05 <- confusionMatrix(ensemble_class_05, test_data$DONATED, positive = "TRUE")
print(confusion_mtx_ensemble_05)
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 1776 412
## TRUE 1129 556
##
## Accuracy : 0.6021
## 95% CI : (0.5865, 0.6176)
## No Information Rate : 0.7501
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.149
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.5744
## Specificity : 0.6114
## Pos Pred Value : 0.3300
## Neg Pred Value : 0.8117
## Prevalence : 0.2499
## Detection Rate : 0.1436
## Detection Prevalence : 0.4351
## Balanced Accuracy : 0.5929
##
## 'Positive' Class : TRUE
##
### Calculation of F1 SCORE for both 0.5 threshold
# 2 * ((precision * recall) / (precision + recall))
TP_05<- 554
FP_05 <- 1138
FN_05 <- 414
precision_05 <- TP_05 / (TP_05 + FP_05)
precision_05
## [1] 0.3274232
recall_05 <- TP_05 / (TP_05 + FN_05)
recall_05
## [1] 0.572314
f1_score_05 <- 2 * (
(precision_05 * recall_05) / (precision_05 + recall_05)
)
f1_score_05
## [1] 0.4165414
# threshold 0.7
# increase the threshold to 0.5
ensemble_class_07 <- ifelse(ensemble_average > 0.7, TRUE, FALSE)
ensemble_class_07 <- as.factor(ensemble_class_07)
# confusion matrix with a threshold of 0.5
confusion_mtx_ensemble_07 <- confusionMatrix(ensemble_class_07, test_data$DONATED, positive = "TRUE")
confusion_mtx_ensemble_07
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 2873 946
## TRUE 32 22
##
## Accuracy : 0.7475
## 95% CI : (0.7335, 0.7611)
## No Information Rate : 0.7501
## P-Value [Acc > NIR] : 0.6526
##
## Kappa : 0.0171
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.02273
## Specificity : 0.98898
## Pos Pred Value : 0.40741
## Neg Pred Value : 0.75229
## Prevalence : 0.24994
## Detection Rate : 0.00568
## Detection Prevalence : 0.01394
## Balanced Accuracy : 0.50586
##
## 'Positive' Class : TRUE
##
### Calculation of F1 SCORE for 0.7 threshold
# 2 * ((precision * recall) / (precision + recall))
TP_07<- 23
FP_07 <- 33
FN_07 <- 945
precision_07 <- TP_07 / (TP_07 + FP_07)
precision_07
## [1] 0.4107143
recall_07 <- TP_07 / (TP_07 + FN_07)
recall_07
## [1] 0.02376033
f1_score_07 <- 2 * (
(precision_07 * recall_07) / (precision_07 + recall_07)
)
f1_score_07
## [1] 0.04492188
With a threshold of 0.3, the F1 SCORE is very low so i increased the threshold to 0.5 and 0.7 and calculated them separately.
With a threshold of 0.5, the recall is 0.572 and precision of 0.327, we get an F1 SCORE of 0.417
With a threshold of 0.7, the recall is 0.024 and precision of 0.411, we get an F1 SCORE of 0.045
0.3 threshold has poor precision and very high recall.
0.7 has good precision but very poor recall.
Considering we have imbalanced data, 0.5 is the best threshold with an F1 SCORE of 0.417
set.seed(123)
# Build the model
rf_model <- randomForest(
DONATED~.,data = train_data,
ntree = 100,
classwt = c("FALSE" = 1, "TRUE"= 3), # this is because i want the model to capture more TRUE cases compared to FALSE cases.
importance = TRUE
)
# evaluate with test data
rf_predictions <- predict(rf_model ,newdata = test_data)
# create a cofusion matrix for the random forest model
rf_confusion_matrix <- confusionMatrix(rf_predictions, test_data$DONATED, positive = 'TRUE')
rf_confusion_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 2893 957
## TRUE 12 11
##
## Accuracy : 0.7498
## 95% CI : (0.7359, 0.7634)
## No Information Rate : 0.7501
## P-Value [Acc > NIR] : 0.5234
##
## Kappa : 0.0107
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.011364
## Specificity : 0.995869
## Pos Pred Value : 0.478261
## Neg Pred Value : 0.751429
## Prevalence : 0.249935
## Detection Rate : 0.002840
## Detection Prevalence : 0.005939
## Balanced Accuracy : 0.503616
##
## 'Positive' Class : TRUE
##
# extract the output
accuracy <- rf_confusion_matrix$overall ['Accuracy']
true_pos_rate <- rf_confusion_matrix$byClass ['Sensitivity']
true_neg_rate <- rf_confusion_matrix$byClass ['Specificity']
plot(rf_model)
The model has performed poorly with Accuracy : 75%, a TPR: 1% and a TNR: 100%.
The sensitivity has dropped meaning that the model can barely capture any TRUE donor cases.
ACTION: Modify the weight and ntrees.
set.seed(123)
# Build rf model II with increased weight for TRUE cases and increased ntrees to 500
rf_model_2 <- randomForest(
DONATED~.,data = train_data,
ntree = 500, # increased the number of trees to 500
classwt = c("FALSE" = 1, "TRUE"= 5), # this is because i want the model to capture more TRUE cases compared to FALSE cases.
importance = TRUE
)
# evaluate with test data
rf_predictions_2 <- predict(rf_model_2 ,newdata = test_data)
# create a cofusion matrix for the random forest model
rf_confusion_matrix_2 <- confusionMatrix(rf_predictions_2, test_data$DONATED, positive = 'TRUE')
rf_confusion_matrix_2
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 2895 958
## TRUE 10 10
##
## Accuracy : 0.7501
## 95% CI : (0.7361, 0.7636)
## No Information Rate : 0.7501
## P-Value [Acc > NIR] : 0.5086
##
## Kappa : 0.0102
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.010331
## Specificity : 0.996558
## Pos Pred Value : 0.500000
## Neg Pred Value : 0.751363
## Prevalence : 0.249935
## Detection Rate : 0.002582
## Detection Prevalence : 0.005164
## Balanced Accuracy : 0.503444
##
## 'Positive' Class : TRUE
##
# extract the output
accuracy <- rf_confusion_matrix_2$overall ['Accuracy']
true_pos_rate <- rf_confusion_matrix_2$byClass ['Sensitivity']
true_neg_rate <- rf_confusion_matrix_2$byClass ['Specificity']
plot(rf_model_2)
The weight was increased for TRUE cases to 5 and FALSE cases maintained at 1. This is telling the model to put an emphasis on capturing TRUE cases more than FALSE.
Results show Accuracy : 75% with a TPR: 1% and a TNR: 100%.
The model does not learn anything and captures FALSE cases only missing 958 potential TRUE donors.
ACTION: Read about sampsize function that can be used instead of weights especially for an imbalanced data set.
For my case, sampsize would mean that each decision tree is forced to train on the a specific number of FALSE and TRUE cases.
There are 11624 FALSE cases and 3875 TRUE cases in training. so we can start with a baseline of 3875 FALSE and 3875 TRUE.
# Build rf model III
rf_model_3 <- randomForest(
DONATED~.,
data = train_data,
ntree = 500,
sampsize = c("FALSE" = 3875, "TRUE" = 3875),
importance = TRUE
)
rf_predictions_3 <- predict(rf_model_3, newdata = test_data, type = "prob")
# convert to class labels using a threshold of 0.5
rf_class_3 <- ifelse(rf_predictions_3[,"TRUE"] > 0.5, TRUE, FALSE)
rf_class_3 <- as.factor(rf_class_3)
# create a confusion matrix
rf_confusion_matrix_3 <- confusionMatrix(rf_class_3, test_data$DONATED, positive = "TRUE")
rf_confusion_matrix_3
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 2407 687
## TRUE 498 281
##
## Accuracy : 0.694
## 95% CI : (0.6793, 0.7085)
## No Information Rate : 0.7501
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1271
##
## Mcnemar's Test P-Value : 4.726e-08
##
## Sensitivity : 0.29029
## Specificity : 0.82857
## Pos Pred Value : 0.36072
## Neg Pred Value : 0.77796
## Prevalence : 0.24994
## Detection Rate : 0.07255
## Detection Prevalence : 0.20114
## Balanced Accuracy : 0.55943
##
## 'Positive' Class : TRUE
##
plot(rf_model_3)
# calculate F1 SCORE for rf model 3 with 0.5 as the threshold
TP_rf<- 292
FP_rf <- 503
FN_rf <- 676
precision_rf <- TP_rf / (TP_rf + FP_rf)
precision_rf
## [1] 0.3672956
recall_rf <- TP_rf / (TP_rf + FN_rf)
recall_rf
## [1] 0.3016529
f1_score_rf <- 2 * (
(precision_rf * recall_rf) / (precision_rf + recall_rf)
)
f1_score_rf
## [1] 0.3312535
The F1 SCORE for the ensemble random forest with ntrees at 500 and using sample size of 3875 for both TRUE cases and FALSE cases, is 0.331 , which is lower than the homogenous logistic regression that had an F1 SCORE of 0.417.
With a recall of 0.302 and precision of 0.367, means that the random forest model is able to capture TRUE cases at 0.302 but misses out on potential TRUE donors at 0.367 .
# we are using ensemble to combine ensemble_class_05 and rf_model_3
# 1. obtain probabilities for each model
logistic_reg_prob <- ensemble_average
random_fst_prob <- rf_predictions_3[,"TRUE"]
# 2. Assign weights based on performance. The F1 SCORE tells which model performed better
wght_logistic_reg <- f1_score_05 / (f1_score_05 + f1_score_rf)
wght_random_fst <- f1_score_rf / (f1_score_rf + f1_score_05)
# 3. combine the weighted average
final_prob <- (logistic_reg_prob * wght_logistic_reg) + (random_fst_prob * wght_random_fst)
#final_prob
# 4. Apply threshold
final_class <- ifelse(final_prob > 0.5, TRUE, FALSE)
final_class <- as.factor(final_class)
# 5. confurion matrix
het_confusion_matrix <- confusionMatrix(final_class,test_data$DONATED, positive = "TRUE")
het_confusion_matrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 2130 554
## TRUE 775 414
##
## Accuracy : 0.6569
## 95% CI : (0.6417, 0.6718)
## No Information Rate : 0.7501
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1495
##
## Mcnemar's Test P-Value : 1.592e-09
##
## Sensitivity : 0.4277
## Specificity : 0.7332
## Pos Pred Value : 0.3482
## Neg Pred Value : 0.7936
## Prevalence : 0.2499
## Detection Rate : 0.1069
## Detection Prevalence : 0.3070
## Balanced Accuracy : 0.5805
##
## 'Positive' Class : TRUE
##
# F1 SCORE for the heterogenous model
TP_het<- 422
FP_het <- 742
FN_het <- 546
precision_het <- TP_het / (TP_het + FP_het)
precision_het
## [1] 0.362543
recall_het <- TP_het / (TP_het + FN_het)
recall_het
## [1] 0.4359504
f1_score_het <- 2 * (
(precision_het * recall_het) / (precision_het + recall_het)
)
f1_score_het
## [1] 0.3958724
The heterogenous model was a combination of ensemble_class_05(LR) and rf_model_3(RF). It produced an F1 SCORE: 0.396 with a recall: 0.436 and
precision: 0.363.
It did not outperform the homogenous logistic regression model that had an F1 SCORE : 0.417.
However the random forest model was weaker at an F1 SCORE: 0.331 compared to the homogenous logistic regression at: 0.417.
It could have weakened the overall performance of the heterogenous model.
save.image(file = "MuchukiM.DA5030.Practicum2-PartA.RData")