LOAD REQUIRED LIBRARIES

library('caret')
## Loading required package: ggplot2
## Loading required package: lattice
library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('ggplot2')
library("corrplot")
## corrplot 0.95 loaded
library("randomForest")
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin

DATA LOADING

zip_url <- "https://s3.us-east-2.amazonaws.com/artificium.us/datasets/donations-salted.csv.zip"
zip_file <- tempfile(fileext = ".zip")
download.file(zip_url, zip_file, mode = 'wb')
df <- read.csv(unz(zip_file, "donations-salted.csv"), header = TRUE, sep = ",")
head(df)

DATA INSPECTION

str(df)
## 'data.frame':    19372 obs. of  20 variables:
##  $ CONTROL_NUMBER         : int  5 12 37 38 41 52 53 67 70 71 ...
##  $ DONATED                : logi  FALSE TRUE FALSE FALSE FALSE FALSE ...
##  $ DONATION_AMT           : int  NA 1000 NA NA NA NA NA 500 NA 1600 ...
##  $ DONOR_AGE              : int  87 79 75 NA 74 63 71 79 41 63 ...
##  $ URBANICITY             : chr  "?" "R" "S" "U" ...
##  $ DONOR_GENDER           : chr  "M" "M" "F" "F" ...
##  $ HOME_OWNER             : chr  "H" "H" "H" "H" ...
##  $ INCOME_LEVEL           : int  2 7 5 6 2 3 5 1 4 4 ...
##  $ SES                    : chr  "." "45" "11" "4" ...
##  $ MEDIAN_HOME_VALUE      : num  554000 334000 2388000 1688000 514000 ...
##  $ MEDIAN_HOUSEHOLD_INCOME: num  294000 212000 405000 153000 328000 182000 122000 189000 180000 307000 ...
##  $ DONATION_RESPONSE      : int  0 0 0 0 8 0 0 0 0 0 ...
##  $ MONTHS_SINCE_LAST_GIFT : int  26 7 6 6 18 19 21 9 17 9 ...
##  $ EMAILS_12              : int  17 28 43 44 25 29 25 33 36 38 ...
##  $ LIFETIME_GIFT_COUNT    : int  35 25 36 37 12 22 20 20 34 22 ...
##  $ LIFETIME_EMAILS        : int  19 32 44 31 30 22 29 33 36 27 ...
##  $ LIFETIME_GIFT_AMOUNT   : num  29700 36800 60300 43500 10600 12800 22000 10100 17100 15000 ...
##  $ LIFETIME_MAX_GIFT_AMT  : num  2000 2500 2800 1700 2500 1000 2000 700 700 1100 ...
##  $ LIFETIME_MIN_GIFT_AMT  : int  500 500 500 300 500 300 500 300 300 300 ...
##  $ LIFETIME_AVG_GIFT_AMT  : int  849 1472 1675 1176 883 582 1100 505 503 682 ...
summary(df)
##  CONTROL_NUMBER    DONATED         DONATION_AMT     DONOR_AGE    
##  Min.   :     5   Mode :logical   Min.   :  100   Min.   : 0.00  
##  1st Qu.: 48289   FALSE:14529     1st Qu.: 1000   1st Qu.:47.00  
##  Median : 96937   TRUE :4843      Median : 1300   Median :60.00  
##  Mean   : 96546                   Mean   : 1562   Mean   :58.92  
##  3rd Qu.:145430                   3rd Qu.: 2000   3rd Qu.:73.00  
##  Max.   :191779                   Max.   :20000   Max.   :87.00  
##                                   NA's   :14529   NA's   :4795   
##   URBANICITY        DONOR_GENDER        HOME_OWNER         INCOME_LEVEL  
##  Length:19372       Length:19372       Length:19372       Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:2.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :4.000  
##                                                           Mean   :3.908  
##                                                           3rd Qu.:5.000  
##                                                           Max.   :7.000  
##                                                           NA's   :4392   
##      SES            MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME DONATION_RESPONSE
##  Length:19372       Min.   :      0   Min.   :      0         Min.   :  0.000  
##  Class :character   1st Qu.: 518000   1st Qu.: 232000         1st Qu.:  0.000  
##  Mode  :character   Median : 747000   Median : 311000         Median :  0.000  
##                     Mean   :1079872   Mean   : 341970         Mean   :  3.362  
##                     3rd Qu.:1227000   3rd Qu.: 417000         3rd Qu.:  3.000  
##                     Max.   :6000000   Max.   :1500000         Max.   :241.000  
##                                                                                
##  MONTHS_SINCE_LAST_GIFT   EMAILS_12     LIFETIME_GIFT_COUNT LIFETIME_EMAILS
##  Min.   : 4.00          Min.   : 2.00   Min.   : 1.00       Min.   : 2.00  
##  1st Qu.:16.00          1st Qu.:16.00   1st Qu.: 4.00       1st Qu.:11.00  
##  Median :18.00          Median :18.00   Median : 8.00       Median :18.00  
##  Mean   :18.19          Mean   :18.27   Mean   : 9.98       Mean   :18.67  
##  3rd Qu.:20.00          3rd Qu.:19.00   3rd Qu.:14.00       3rd Qu.:26.00  
##  Max.   :27.00          Max.   :77.00   Max.   :95.00       Max.   :56.00  
##                                                                            
##  LIFETIME_GIFT_AMOUNT LIFETIME_MAX_GIFT_AMT LIFETIME_MIN_GIFT_AMT
##  Min.   :  1500       Min.   :   500        Min.   :    0.0      
##  1st Qu.:  4200       1st Qu.:  1200        1st Qu.:  300.0      
##  Median :  7900       Median :  1600        Median :  500.0      
##  Mean   : 10443       Mean   :  1921        Mean   :  762.1      
##  3rd Qu.: 13200       3rd Qu.:  2100        3rd Qu.: 1000.0      
##  Max.   :377500       Max.   :100000        Max.   :45000.0      
##                                                                  
##  LIFETIME_AVG_GIFT_AMT
##  Min.   :  136        
##  1st Qu.:  800        
##  Median : 1120        
##  Mean   : 1286        
##  3rd Qu.: 1500        
##  Max.   :45000        
## 

DATA EXPLORATION

Check for missing values

# check for NA values
colSums(is.na(df))
##          CONTROL_NUMBER                 DONATED            DONATION_AMT 
##                       0                       0                   14529 
##               DONOR_AGE              URBANICITY            DONOR_GENDER 
##                    4795                       0                       0 
##              HOME_OWNER            INCOME_LEVEL                     SES 
##                       0                    4392                       0 
##       MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME       DONATION_RESPONSE 
##                       0                       0                       0 
##  MONTHS_SINCE_LAST_GIFT               EMAILS_12     LIFETIME_GIFT_COUNT 
##                       0                       0                       0 
##         LIFETIME_EMAILS    LIFETIME_GIFT_AMOUNT   LIFETIME_MAX_GIFT_AMT 
##                       0                       0                       0 
##   LIFETIME_MIN_GIFT_AMT   LIFETIME_AVG_GIFT_AMT 
##                       0                       0
sum(is.na(df))
## [1] 23716
# check for empty strings
any(apply(df, 2, function(x) any(x == ""| x == " ")))
## [1] NA
# check for unknown entries in the data frame
colSums(df == "uknown"|df == "Uknown")
##          CONTROL_NUMBER                 DONATED            DONATION_AMT 
##                       0                       0                      NA 
##               DONOR_AGE              URBANICITY            DONOR_GENDER 
##                      NA                       0                       0 
##              HOME_OWNER            INCOME_LEVEL                     SES 
##                       0                      NA                       0 
##       MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME       DONATION_RESPONSE 
##                       0                       0                       0 
##  MONTHS_SINCE_LAST_GIFT               EMAILS_12     LIFETIME_GIFT_COUNT 
##                       0                       0                       0 
##         LIFETIME_EMAILS    LIFETIME_GIFT_AMOUNT   LIFETIME_MAX_GIFT_AMT 
##                       0                       0                       0 
##   LIFETIME_MIN_GIFT_AMT   LIFETIME_AVG_GIFT_AMT 
##                       0                       0

Key findings

Lets investigate columns : INCOME_LEVEL , DONATION_AMT , DONOR_AGE

There are no empty strings in the data frame. Yay!

There are no “Unknown” or “unknown” entries in the data frame. Yay!

# DONATION_AMT
#DONATION_AMT: the monetary amount of the most recent gift
unique(df$DONATION_AMT)
##  [1]    NA  1000   500  1600   300  1200   800  1500  3300  2000  2500  3500
## [13]   200  1100  1800  4000  2100  3200  3000  1400  2400  5000  1250   700
## [25]   900   400   600  2800  1300  1700  2700  3600  2200  2300  3100  1900
## [37]  4800  7500 10000  4700  4400  9500   100 10200 10100  3400  1070  6000
## [49]  4500  2600  1750  1687  3800  4100 20000  3700  1825  2900  5100  4600
## [61]  4200   250  4300  5300   750   525  5500   450  4421  1392 15000
sum(is.na(df$DONATION_AMT))
## [1] 14529
# lets look at the scale of missingness with NA values
14529 / nrow(df) * 100
## [1] 75
# lets look at the rows
df[is.na(df$DONATION_AMT), ]
# lets impute the NA with 0 donation amount
df$DONATION_AMT[is.na(df$DONATION_AMT)] <- 0

# verify the change
sum(is.na(df$DONATION_AMT))
## [1] 0

Key findings

There are 14,529 NA vlaues in the DONATION_AMT representing 75%.

Using the mean would skew the distribution in the data set.

The rows who’s DONATION_AMT is NA have no specific pattern

SOLUTION : Impute the NA with 0 donation amount

# DONOR_AGE 
# DONOR_AGE - the age of donors in years
unique(df$DONOR_AGE)
##  [1] 87 79 75 NA 74 63 71 41 73 81 76 69 45 43 55 77 37 52 30 67 65 59 42 58 56
## [26] 80 82 64 48 85 47 70 40 32 53 86 54 33 60 49  7 17 57 78 50 39 66 51 72 38
## [51] 61 68 24 16 84 44 34 83 35 29 26 27 46 31 62 23 36 18  6 25 28 19 21 22  0
## [76]  2 12 20 15  4 14
sum(is.na(df$DONOR_AGE))
## [1] 4795
# lets look at the rows with NA values
df[is.na(df$DONOR_AGE), ]
table(df$DONOR_AGE)
## 
##   0   2   4   6   7  12  14  15  16  17  18  19  20  21  22  23  24  25  26  27 
##   2   3   1  10  74   1   1   2  11 101   2   5   1  13   9  23  10  46  35 124 
##  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47 
##  37  79  55 111  65 137  80 202 117 282 112 244 160 261 179 278 170 343 178 358 
##  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67 
## 212 404 237 364 177 321 212 361 181 365 186 320 169 264 208 271 178 347 228 411 
##  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87 
## 231 343 225 350 239 395 295 380 281 381 251 326 215 276 192 239 140 174 144 162
summary(df$DONOR_AGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   47.00   60.00   58.92   73.00   87.00    4795
# sum of donors who are 18 years and below
sum(df$DONOR_AGE < 18, na.rm = TRUE) 
## [1] 206
# lets use a threshold of 18 and equate all donors <18 to NA for age
df$DONOR_AGE[df$DONOR_AGE <18] <- NA 
# impute the NA with the median age
median_age <- median(df$DONOR_AGE, na.rm = TRUE)
df$DONOR_AGE[is.na(df$DONOR_AGE)] <- median_age
# verify the change
sum(is.na(df$DONOR_AGE))
## [1] 0
table(df$DONOR_AGE)
## 
##   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32   33 
##    2    5    1   13    9   23   10   46   35  124   37   79   55  111   65  137 
##   34   35   36   37   38   39   40   41   42   43   44   45   46   47   48   49 
##   80  202  117  282  112  244  160  261  179  278  170  343  178  358  212  404 
##   50   51   52   53   54   55   56   57   58   59   60   61   62   63   64   65 
##  237  364  177  321  212  361  181  365  186  320 5170  264  208  271  178  347 
##   66   67   68   69   70   71   72   73   74   75   76   77   78   79   80   81 
##  228  411  231  343  225  350  239  395  295  380  281  381  251  326  215  276 
##   82   83   84   85   86   87 
##  192  239  140  174  144  162

Key findings

There are 4795 NA values in DONOR_AGE

There is a donor at 0 age. I think thats likley an error

ACTION: I set a threshold of 18 and all DONOR-AGE below 18 were set to NA then imputed all NA vlaues with the median age.

# INCOME_LEVEL: one of 7 income levels; higher means more income; exact bracket structure is unknown
unique(df$INCOME_LEVEL)
## [1]  2  7  5  6  3  1  4 NA
table(df$INCOME_LEVEL)
## 
##    1    2    3    4    5    6    7 
## 1822 2626 1699 2526 3149 1591 1567
sum(is.na(df$INCOME_LEVEL))
## [1] 4392
df[is.na(df$INCOME_LEVEL), ]
# check what levsls 1- 7 mean using the donation amount col
tapply(df$DONATION_AMT, df$INCOME_LEVEL, mean, na.rm = TRUE)
##        1        2        3        4        5        6        7 
## 277.0033 331.8751 357.2101 387.3120 425.6812 473.1301 481.0466
# imput the the NA values with the median
median_income <- median(df$INCOME_LEVEL, na.rm = TRUE)
df$INCOME_LEVEL[is.na(df$INCOME_LEVEL)] <- median_income
# verify the change
sum(is.na(df$INCOME_LEVEL))
## [1] 0

Key findings

There are 4392 NA values in INCOME_LEVEL

There is no specific pattern in the rows of the involved na column

I compared INCOME_LEVEL with DONATION_AMT to understand the what levels 1-7 mean. They mean lowest to highest

ACTION: Impute the NA values with the median

# SES: socio-economic status for donor cluster; higher number means a higher socio-economic status
unique(df$SES)
##  [1] "."  "45" "11" "4"  "49" "8"  "50" "28" "30" "43" "53" "42" "46" "20" "16"
## [16] "40" "7"  "34" "23" "35" "41" "25" "10" "1"  "9"  "2"  "12" "14" "37" "36"
## [31] "15" "39" "38" "18" "48" "24" "3"  "13" "31" "5"  "27" "19" "51" "22" "17"
## [46] "26" "21" "44" "6"  "29" "33" "47" "32" "52"
table(unique(df$SES))
## 
##  .  1 10 11 12 13 14 15 16 17 18 19  2 20 21 22 23 24 25 26 27 28 29  3 30 31 
##  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
## 32 33 34 35 36 37 38 39  4 40 41 42 43 44 45 46 47 48 49  5 50 51 52 53  6  7 
##  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
##  8  9 
##  1  1
# compare SES col with DONATION AMT
tapply(df$DONATION_AMT, df$SES, mean, na.rm = TRUE)
##        .        1       10       11       12       13       14       15 
## 630.0022 581.5900 277.5194 441.9421 415.3724 600.6908 468.8326 426.2332 
##       16       17       18       19        2       20       21       22 
## 366.1458 377.9370 420.5977 336.7347 407.7632 445.7413 385.0567 427.6892 
##       23       24       25       26       27       28       29        3 
## 369.1126 426.8553 335.8974 337.6238 378.8288 487.1720 451.7647 524.7500 
##       30       31       32       33       34       35       36       37 
## 253.9499 389.1566 267.7632 288.9908 390.8451 473.0399 350.8380 328.9216 
##       38       39        4       40       41       42       43       44 
## 392.9167 364.4531 398.2301 414.2169 276.9142 416.0458 308.5470 300.3916 
##       45       46       47       48       49        5       50       51 
## 287.1203 406.7751 228.1081 345.0000 321.3333 380.9045 373.0769 278.6957 
##       52       53        6        7        8        9 
## 233.3333 419.8020 308.9431 433.6957 316.4021 294.7712
# impute the "." with the mode of SES column
mode_SES <- as.numeric(sort(table(df$SES), decreasing = TRUE)[1])
df$SES[df$SES == "."] <- mode_SES
# verify the change
table(unique(df$SES))
## 
##   1  10  11  12  13  14  15  16  17  18  19   2  20  21  22  23  24  25  26  27 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  28  29   3  30  31  32  33  34  35  36  37  38  39   4  40  41  42  43  44  45 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  46  47  48  49   5  50  51  52  53   6   7   8 830   9 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1
df$SES <- as.numeric(df$SES)

Key findings

There is a “.” entry in the SES column.

I compared it with the DONATION AMT column. There is no order to the groups of donor clusters hence SES is not ordinal but categorical

ACTION: Impute the “.” with the mode.

# URBANICITY: the “urbanicity” of the donor’s place of residence: 
# U = urban (i.e., lives in a (large) city), 
# S = suburban (outside but near the city), 
# T = town (lives in a “small” town), 
# R = rural (e.g., farm, away from town) , 
# ? = unknown
unique(df$URBANICITY)
## [1] "?" "R" "S" "U" "C" "T"
table(df$URBANICITY, useNA = "always")
## 
##    ?    C    R    S    T    U <NA> 
##  454 4022 4005 4491 3944 2456    0
df$URBANICITY[df$URBANICITY == "?"] <- NA
df$URBANICITY[df$URBANICITY == "C"] <- NA
# verify 
unique(df$URBANICITY)
## [1] NA  "R" "S" "U" "T"
# impute the NA with the mode
mode_URBANICITY <- names(sort(table(df$URBANICITY), decreasing = TRUE)[1])
df$URBANICITY[is.na(df$URBANICITY)] <- mode_URBANICITY
# verify the change
table(df$URBANICITY, useNA = "always")
## 
##    R    S    T    U <NA> 
## 4005 8967 3944 2456    0
proportions <- prop.table(table(df$URBANICITY[!is.na(df$URBANICITY)]))
proportions
## 
##         R         S         T         U 
## 0.2067417 0.4628846 0.2035928 0.1267809
table(is.na(df$URBANICITY))
## 
## FALSE 
## 19372
# change the entries to numerical values
dummies <- dummyVars(~URBANICITY, data = df)
URBANICITY_encoded <- predict(dummies, df) %>% as.data.frame()
df <- bind_cols(
  df %>% select(-URBANICITY), URBANICITY_encoded
)

# verify for any NA 
sum(is.na(df$URBANICITYR))
## [1] 0
sum(is.na(df$URBANICITYS))
## [1] 0
sum(is.na(df$URBANICITYT))
## [1] 0
sum(is.na(df$URBANICITYU))
## [1] 0

Key findings

There is “?” which could be NA

The metadata does not have “C” but the dataset has “C”

ACTION: use one hot encoding to impute values in URBANICITY

# lets explore the binary columns - DONATED , DONOR_GENDER, HOME_OWNER
table(unique(df$DONOR_GENDER))
## 
## F M U 
## 1 1 1
table(unique(df$HOME_OWNER))
## 
## H N U 
## 1 1 1
# DONOR_GENDER 
table(df$DONOR_GENDER)
## 
##    F    M    U 
## 9984 7633 1755
# DONOR-GENEDER is nominal categorical 
# use one hot encoding
dummies <- dummyVars(~DONOR_GENDER, data = df)
DONOR_GENDER_encoded <- predict(dummies, df) %>% as.data.frame()
df <- bind_cols(
  df %>% select(-DONOR_GENDER), DONOR_GENDER_encoded
)
# HOME_OWNER: whether donor is a home owner or not: H = yes, U = unknown, N = no
table(df$HOME_OWNER)
## 
##    H    N    U 
## 9456 2151 7765
# use one-hot-encoding
dummies <- dummyVars(~HOME_OWNER, data = df)
HOME_OWNER_encoded <- predict(dummies, df) %>% as.data.frame()
df <- bind_cols(
  df%>% select(-HOME_OWNER), HOME_OWNER_encoded
)

# verify the changes
head(df)

Key findings

The DONOR_GENDER has ‘UKNOWN’ which is its own category, hence three categories with no order

The HOME_OWNER is also nominal categorical as it has three categories of no specific order. H = yes, U = unknown, N = no

# check the MEDIAN_HOME_VALUE
table(df$MEDIAN_HOME_VALUE[df$MEDIAN_HOME_VALUE == 0])
## 
##   0 
## 218
# rows with MEDIAN_HOME_VALUE  of zero
zero_MEDIAN_HOME_VALUE  <- which(df$MEDIAN_HOME_VALUE == 0)
print(df[zero_MEDIAN_HOME_VALUE, c(
  "DONOR_AGE",
  "INCOME_LEVEL", 
  "SES",
  "MEDIAN_HOME_VALUE", 
  "MEDIAN_HOUSEHOLD_INCOME",
  "DONATION_RESPONSE",
  "MONTHS_SINCE_LAST_GIFT",
  "LIFETIME_GIFT_COUNT",
  "LIFETIME_GIFT_AMOUNT",
  "LIFETIME_MAX_GIFT_AMT",
  "LIFETIME_MIN_GIFT_AMT",
  "LIFETIME_AVG_GIFT_AMT",
  "EMAILS_12"
)])
##       DONOR_AGE INCOME_LEVEL SES MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME
## 139          60            4  11                 0                       0
## 352          60            7  12                 0                       0
## 363          63            2  21                 0                       0
## 459          60            4  10                 0                  413000
## 674          75            7  11                 0                  237000
## 862          81            1   4                 0                   55000
## 1122         60            4  10                 0                       0
## 1123         60            4  43                 0                       0
## 1177         41            3 830                 0                       0
## 1243         60            4  31                 0                       0
## 1496         86            5  23                 0                  364000
## 1550         60            4 830                 0                       0
## 1551         60            4 830                 0                       0
## 1552         60            4  42                 0                  326000
## 1799         49            4  31                 0                       0
## 2008         73            1  32                 0                       0
## 2208         53            3  36                 0                       0
## 2223         76            2  36                 0                       0
## 2230         61            5  40                 0                       0
## 2357         60            4  39                 0                       0
## 2420         60            4  16                 0                       0
## 2661         60            4  30                 0                       0
## 2713         82            4  40                 0                       0
## 2810         37            7  35                 0                       0
## 2811         60            4  49                 0                       0
## 2825         51            5  35                 0                       0
## 2925         60            4  31                 0                  132000
## 2951         43            5  17                 0                       0
## 2994         60            4  42                 0                       0
## 3176         60            4  30                 0                       0
## 3233         51            3  51                 0                       0
## 3286         60            4  21                 0                       0
## 3400         60            4  14                 0                       0
## 3654         47            6  18                 0                       0
## 3827         60            4   1                 0                   58000
## 3837         60            1  10                 0                   50000
## 3877         60            4  31                 0                       0
## 3890         60            4  19                 0                       0
## 4235         78            2 830                 0                       0
## 4261         75            4  10                 0                       0
## 4271         60            1  26                 0                  344000
## 4410         60            4  10                 0                       0
## 4411         60            4  10                 0                       0
## 4412         60            4  10                 0                       0
## 4413         60            4  10                 0                       0
## 4414         60            4  10                 0                       0
## 4449         60            2  16                 0                       0
## 4556         27            2   9                 0                       0
## 4568         37            2   8                 0                       0
## 4569         60            4   7                 0                       0
## 4604         60            4  33                 0                       0
## 4605         60            4  21                 0                       0
## 4615         74            2  36                 0                       0
## 4766         60            1  31                 0                       0
## 4999         41            4  39                 0                       0
## 5126         51            4  36                 0                       0
## 5290         48            4  40                 0                       0
## 5484         60            4  41                 0                       0
## 5722         41            3  42                 0                       0
## 5872         34            4  49                 0                       0
## 5875         60            4  41                 0                       0
## 6538         80            4  23                 0                  429000
## 7364         39            3  10                 0                       0
## 7570         61            4  36                 0                       0
## 7802         51            7  24                 0                       0
## 8344         55            1   9                 0                  181000
## 8345         86            2   9                 0                  181000
## 8346         60            1   1                 0                       0
## 8348         37            1  10                 0                  118000
## 9278         38            4  10                 0                   54000
## 9280         55            4   1                 0                       0
## 9281         60            4   1                 0                   97000
## 9316         60            4   1                 0                  369000
## 9320         60            4   1                 0                  369000
## 9321         45            1   1                 0                  547000
## 9323         80            5   1                 0                  315000
## 9499         75            5   1                 0                       0
## 10210        60            4  10                 0                       0
## 10224        79            2  31                 0                       0
## 10259        45            2  34                 0                       0
## 10403        55            6  31                 0                       0
## 10575        57            5  13                 0                       0
## 10720        60            4  31                 0                       0
## 10733        53            4  18                 0                       0
## 10834        42            1   9                 0                       0
## 10857        60            3  12                 0                       0
## 10951        61            4  39                 0                       0
## 10994        60            4  32                 0                       0
## 11022        60            4  31                 0                       0
## 11023        57            4  31                 0                       0
## 11085        82            4  36                 0                       0
## 11153        60            4  25                 0                       0
## 11245        60            4  24                 0                       0
## 11258        60            4  15                 0                       0
## 11328        60            4  12                 0                       0
## 11338        37            1  30                 0                       0
## 11386        75            1  41                 0                       0
## 11448        60            4  34                 0                       0
## 11522        70            4  41                 0                       0
## 11610        35            5  24                 0                  291000
## 11618        37            4  21                 0                  272000
## 11739        49            5  16                 0                       0
## 11748        78            4  21                 0                       0
## 11758        60            1  21                 0                       0
## 11759        80            2  21                 0                       0
## 12016        55            2  41                 0                       0
## 12028        60            4  39                 0                       0
## 12054        60            4  31                 0                       0
## 12074        35            3  30                 0                       0
## 12081        78            3  10                 0                       0
## 12245        35            7  10                 0                       0
## 12247        60            4  22                 0                       0
## 12248        60            4  10                 0                       0
## 12295        21            6  35                 0                       0
## 12322        49            6   1                 0                       0
## 12386        42            4  38                 0                       0
## 12554        51            4  39                 0                       0
## 12568        74            2   9                 0                  202000
## 12689        57            3  31                 0                       0
## 12707        60            4  31                 0                       0
## 12848        59            2  31                 0                       0
## 12873        60            4  15                 0                       0
## 12891        37            6  18                 0                       0
## 12949        60            4  21                 0                       0
## 12986        60            4   1                 0                  382000
## 13028        69            1   4                 0                       0
## 13047        60            3  25                 0                   72000
## 13048        64            2   9                 0                  201000
## 13054        76            1  10                 0                  200000
## 13119        49            7  40                 0                       0
## 13259        60            1  31                 0                       0
## 13264        60            3  24                 0                       0
## 13292        29            3  41                 0                       0
## 13368        49            5  26                 0                       0
## 13401        60            4  51                 0                   55000
## 13417        40            5  39                 0                       0
## 13462        75            1  51                 0                       0
## 13713        27            1  29                 0                       0
## 13731        60            1  30                 0                       0
## 13734        85            1  30                 0                       0
## 13970        60            4  41                 0                       0
## 14011        75            4  24                 0                       0
## 14022        60            4  41                 0                       0
## 14041        45            3  44                 0                       0
## 14053        63            5  46                 0                       0
## 14165        55            4   8                 0                       0
## 14167        39            4  21                 0                       0
## 14239        60            4  10                 0                  113000
## 14399        60            4  10                 0                       0
## 14505        60            4   4                 0                  303000
## 14519        75            4   1                 0                       0
## 14557        60            4   6                 0                  258000
## 14578        70            4  33                 0                  178000
## 14921        60            4  34                 0                       0
## 15269        70            2  33                 0                  169000
## 15579        44            4  30                 0                  338000
## 15675        44            4   4                 0                       0
## 15773        52            4 830                 0                       0
## 15827        79            7  35                 0                       0
## 15831        60            4   1                 0                  333000
## 15843        39            4  17                 0                  314000
## 16025        61            5  20                 0                       0
## 16338        85            1  10                 0                  132000
## 16339        60            4  10                 0                  132000
## 16342        60            4  10                 0                       0
## 16345        43            1   9                 0                  192000
## 16418        51            4  15                 0                       0
## 16501        50            5  24                 0                       0
## 16676        60            4  10                 0                   58000
## 16691        47            4  10                 0                  112000
## 16698        60            4  25                 0                   74000
## 16772        60            4   9                 0                  263000
## 16842        60            4  34                 0                       0
## 16981        63            5  10                 0                       0
## 16982        60            4  21                 0                       0
## 17303        60            1  25                 0                  207000
## 17335        60            4 830                 0                       0
## 17459        60            4  10                 0                       0
## 17460        60            4  10                 0                       0
## 17461        60            4  10                 0                       0
## 17462        60            4  10                 0                       0
## 17463        60            4  10                 0                       0
## 17464        60            4  10                 0                       0
## 17465        60            4  10                 0                       0
## 17466        60            4  10                 0                       0
## 17467        60            4  10                 0                       0
## 17468        60            4 830                 0                       0
## 17469        60            4 830                 0                       0
## 17520        60            4  41                 0                  293000
## 17539        60            4 830                 0                       0
## 17540        60            4 830                 0                       0
## 17596        61            5  31                 0                       0
## 17666        60            4  16                 0                   65000
## 17710        55            3   8                 0                       0
## 17711        41            5  40                 0                       0
## 17721        60            5  25                 0                       0
## 17876        43            5  37                 0                       0
## 17880        49            3  27                 0                       0
## 17908        41            7  42                 0                       0
## 17953        60            4  15                 0                       0
## 17968        60            4  21                 0                       0
## 17974        25            1  18                 0                       0
## 18025        60            5  40                 0                       0
## 18068        60            4  10                 0                  102000
## 18087        35            4  10                 0                       0
## 18290        49            3  27                 0                       0
## 18337        60            1  10                 0                       0
## 18375        60            4  39                 0                  210000
## 18382        34            4  36                 0                       0
## 18491        60            2  30                 0                       0
## 18495        60            4  40                 0                       0
## 18894        60            4  10                 0                       0
## 18913        60            4  21                 0                       0
## 19040        60            4  10                 0                       0
## 19159        60            4 830                 0                       0
## 19306        60            2  51                 0                       0
## 19339        56            4 830                 0                       0
## 19344        79            5 830                 0                       0
##       DONATION_RESPONSE MONTHS_SINCE_LAST_GIFT LIFETIME_GIFT_COUNT
## 139                   0                     18                   6
## 352                   6                      4                  13
## 363                   0                     15                   9
## 459                   0                     18                   6
## 674                   0                     16                  15
## 862                   3                     18                   8
## 1122                  0                     17                  53
## 1123                  0                     18                  12
## 1177                  0                     22                  30
## 1243                  0                     21                   8
## 1496                  0                     15                  11
## 1550                  0                     15                   3
## 1551                  0                     21                   4
## 1552                  0                     18                   3
## 1799                  0                     23                   1
## 2008                  0                     16                   7
## 2208                  0                     18                  16
## 2223                 42                     17                  13
## 2230                  0                     21                  13
## 2357                  0                     15                  11
## 2420                  0                     20                   1
## 2661                  0                     17                   7
## 2713                  0                     15                  14
## 2810                  7                     23                   5
## 2811                  0                     17                   4
## 2825                  0                     21                  11
## 2925                  0                     16                   2
## 2951                  0                     17                   1
## 2994                  0                     25                  21
## 3176                  0                     26                  37
## 3233                  0                     23                   2
## 3286                  0                     15                  12
## 3400                  0                     19                   2
## 3654                  0                     18                   4
## 3827                  0                     18                  19
## 3837                  0                     18                   5
## 3877                  0                     17                  24
## 3890                  0                     15                   7
## 4235                  0                     15                   5
## 4261                  0                     16                  25
## 4271                  0                     22                   9
## 4410                  0                     24                  13
## 4411                  0                     18                  23
## 4412                  0                     24                  11
## 4413                  0                     24                   4
## 4414                  0                     18                   4
## 4449                  9                     21                   1
## 4556                  0                     25                  16
## 4568                  1                     18                  24
## 4569                  0                     15                   5
## 4604                  0                     18                  19
## 4605                  0                     16                   2
## 4615                  0                     17                  21
## 4766                  0                     18                   4
## 4999                  0                     18                   8
## 5126                  0                     17                   9
## 5290                  0                     18                  17
## 5484                  0                     16                   6
## 5722                  0                     26                  18
## 5872                  0                     24                   2
## 5875                  0                     16                   6
## 6538                  0                     20                   1
## 7364                  0                     25                  18
## 7570                  0                     16                  12
## 7802                  0                     21                  10
## 8344                  0                     18                   8
## 8345                  0                     15                   6
## 8346                  0                     18                   5
## 8348                  1                     17                   1
## 9278                  0                     18                   7
## 9280                  0                     17                   6
## 9281                  0                     20                   1
## 9316                  0                     18                   5
## 9320                  0                     20                  21
## 9321                  0                     17                  12
## 9323                 15                     19                  16
## 9499                  0                     20                   3
## 10210                 0                     21                   2
## 10224                 0                     16                  10
## 10259                 0                     23                  12
## 10403                 0                     20                   1
## 10575                 6                     17                   1
## 10720                 0                     17                   9
## 10733                 0                     17                  10
## 10834                 0                     16                   1
## 10857                 0                     19                   2
## 10951                 0                     19                   6
## 10994                 0                     15                   4
## 11022                 0                     21                   8
## 11023                 0                     20                   6
## 11085                 0                     20                  10
## 11153                 0                     20                   2
## 11245                 0                     19                   1
## 11258                 0                     15                   5
## 11328                 0                     17                   1
## 11338                 0                     26                   4
## 11386                 0                     17                  34
## 11448                 0                     18                   4
## 11522                 0                     18                  21
## 11610                 5                     23                   2
## 11618                 0                     20                   1
## 11739                 1                     16                   1
## 11748                 0                     26                  21
## 11758                 0                     15                  10
## 11759                 0                     16                  16
## 12016                 0                     18                   6
## 12028                 0                     21                   1
## 12054                 0                     18                  10
## 12074                 0                     15                  13
## 12081                12                     21                   2
## 12245                 0                     16                   6
## 12247                 0                     17                  27
## 12248                 0                     21                   5
## 12295                 7                     24                  16
## 12322                 0                     15                  24
## 12386                 0                     18                  22
## 12554                 0                     17                   1
## 12568                 0                     16                  17
## 12689                 0                     16                  12
## 12707                 0                     22                  10
## 12848                 0                     19                   5
## 12873                 0                     24                  14
## 12891                 6                     16                  15
## 12949                 0                     20                   1
## 12986                 0                     20                   3
## 13028                 0                     18                  12
## 13047                17                     20                   3
## 13048                 0                     19                  16
## 13054                12                     18                  15
## 13119                 0                     25                  11
## 13259                 0                     15                   4
## 13264                 0                     24                  34
## 13292                 0                     18                   4
## 13368                 0                     20                   1
## 13401                 0                     17                  18
## 13417                 2                     15                  11
## 13462                 0                     21                   2
## 13713                 0                     24                  14
## 13731                 2                     18                  21
## 13734                18                     16                   6
## 13970                 0                     22                   2
## 14011                 0                     18                   5
## 14022                 0                     15                  18
## 14041                 0                     18                   2
## 14053                 0                     16                  22
## 14165                 0                     26                  17
## 14167                 0                     24                   8
## 14239                 0                     16                   1
## 14399                 0                     23                   2
## 14505                 0                     15                  15
## 14519                 0                     23                   1
## 14557                 0                     22                   1
## 14578                 7                     19                   3
## 14921                 0                     24                   5
## 15269                 5                     15                   7
## 15579                 1                     18                   6
## 15675                 0                     16                   3
## 15773                 0                     17                   4
## 15827                 0                     18                   3
## 15831                 0                     21                   1
## 15843                 0                     20                   4
## 16025                 0                     16                   8
## 16338                 1                     18                  22
## 16339                 0                     16                   2
## 16342                 0                     18                  12
## 16345                 4                     21                   2
## 16418                 3                     15                  15
## 16501                17                     18                   2
## 16676                 0                     15                   3
## 16691                 0                     18                  16
## 16698                 0                     25                   2
## 16772                 0                     21                   1
## 16842                 0                     19                   6
## 16981                 0                     18                  28
## 16982                 0                     18                   7
## 17303                 0                     18                  16
## 17335                 0                     16                  20
## 17459                 0                     18                   6
## 17460                 0                     22                   8
## 17461                 0                     15                  20
## 17462                 0                     15                   4
## 17463                 0                     22                   8
## 17464                 0                     17                   4
## 17465                 0                     17                   5
## 17466                 0                     18                   7
## 17467                 0                     23                   3
## 17468                 0                     16                  36
## 17469                 0                     20                   3
## 17520                 0                     16                   1
## 17539                 0                     16                   9
## 17540                 0                     17                  14
## 17596                 0                     20                  16
## 17666                 0                     22                   8
## 17710                 0                     18                   6
## 17711                 0                     16                   3
## 17721                 1                     18                  16
## 17876                 0                     19                   1
## 17880                 1                     20                   5
## 17908                 0                     19                   4
## 17953                 0                     22                  16
## 17968                 0                     21                   1
## 17974                 0                     17                   5
## 18025                 0                     17                   1
## 18068                 0                     18                   5
## 18087                 0                     19                  16
## 18290                 0                     24                   3
## 18337                 0                     20                   5
## 18375                 0                     15                  12
## 18382                 0                     17                  12
## 18491                 4                     24                   2
## 18495                 0                     16                   7
## 18894                 0                     16                   3
## 18913                 0                     20                   2
## 19040                 0                      5                  23
## 19159                 0                     16                  11
## 19306                 0                     21                   1
## 19339                 0                     18                  10
## 19344                 0                     21                   2
##       LIFETIME_GIFT_AMOUNT LIFETIME_MAX_GIFT_AMT LIFETIME_MIN_GIFT_AMT
## 139                  15500                  3000                  2000
## 352                  21300                  2500                  1000
## 363                   6900                  1000                   500
## 459                  33700                 10000                  2500
## 674                  20400                  5000                   500
## 862                  11859                  2500                   359
## 1122                 49800                  1500                   500
## 1123                  9500                  1500                   500
## 1177                 31300                  2500                   300
## 1243                 13600                  3000                   500
## 1496                 18000                  2500                   500
## 1550                  2200                  1100                   500
## 1551                  3700                  1700                   500
## 1552                  6000                  2000                  2000
## 1799                  2500                  2500                  2500
## 2008                  8600                  1800                   500
## 2208                 37000                  5000                   500
## 2223                  6800                  1000                   100
## 2230                 12600                  1300                   500
## 2357                  6600                  2000                   200
## 2420                  2000                  2000                  2000
## 2661                  4500                  1000                   400
## 2713                 17400                  2100                   300
## 2810                  7900                  2500                   300
## 2811                  6100                  2500                   600
## 2825                 17200                  5000                   300
## 2925                  2000                  1500                   500
## 2951                  2000                  2000                  2000
## 2994                 13200                  1000                   300
## 3176                 34900                  1500                   200
## 3233                  6500                  4000                  2500
## 3286                  7600                  1000                   300
## 3400                  1900                  1500                   400
## 3654                  8000                  2500                  1500
## 3827                 10900                  1000                   500
## 3837                  3000                  1000                   200
## 3877                 25800                  2200                   200
## 3890                 23100                  5000                  2500
## 4235                  3700                  1000                   500
## 4261                 11500                   900                   300
## 4271                 10000                  2000                   200
## 4410                 19400                  2600                   500
## 4411                 12000                   800                   200
## 4412                 27500                  6000                   500
## 4413                  3800                  1500                   500
## 4414                  4700                  1500                  1000
## 4449                  1500                  1500                  1500
## 4556                 10800                  1000                   300
## 4568                  8500                  1000                   200
## 4569                  5300                  1500                   500
## 4604                 14100                  1500                   300
## 4605                  2500                  1500                  1000
## 4615                 25900                  2000                   700
## 4766                  5900                  2000                   500
## 4999                  7600                  1500                   500
## 5126                 10000                  1500                   500
## 5290                 11800                  1500                   200
## 5484                  5500                  2000                   500
## 5722                 15100                  1200                   300
## 5872                  4000                  2000                  2000
## 5875                  3200                   700                   300
## 6538                  3000                  3000                  3000
## 7364                 15800                  1300                   500
## 7570                  5000                   700                   200
## 7802                  9700                  1500                   500
## 8344                  4500                  1000                   500
## 8345                  8000                  1500                  1000
## 8346                  5400                  1400                   600
## 8348                  2000                  2000                  2000
## 9278                 13500                  3000                  1000
## 9280                  6800                  2000                   500
## 9281                  2500                  2500                  2500
## 9316                  6100                  1300                  1000
## 9320                 19800                  1900                   300
## 9321                 13900                  2000                   500
## 9323                  8700                  1100                   200
## 9499                  3200                  1500                   500
## 10210                 2500                  1500                  1000
## 10224                 4800                   500                   300
## 10259                 7200                  1000                   500
## 10403                 1500                  1500                  1500
## 10575                 1500                  1500                  1500
## 10720                15000                  3000                  1000
## 10733                38000                 10000                   500
## 10834                 2500                  2500                  2500
## 10857                 3700                  2200                  1500
## 10951                 3400                  1000                   400
## 10994                 3300                  1000                   500
## 11022                 7577                  1577                   300
## 11023                 7200                  2000                   300
## 11085                12400                  1800                   500
## 11153                 3500                  2000                  1500
## 11245                 2500                  2500                  2500
## 11258                 3200                  1000                   500
## 11328                 1500                  1500                  1500
## 11338                 9400                  2700                  2000
## 11386                11450                  1500                   150
## 11448                 6200                  2200                  1000
## 11522                20600                  1300                   300
## 11610                 2600                  1600                  1000
## 11618                 5000                  5000                  5000
## 11739                 1500                  1500                  1500
## 11748                21700                  1500                   300
## 11758                10400                  1500                   500
## 11759                16300                  1700                   200
## 12016                 6600                  2000                   500
## 12028                 2500                  2500                  2500
## 12054                19400                  2500                   500
## 12074                11600                  1500                   500
## 12081                 4000                  2500                  1500
## 12245                20300                  5200                  2100
## 12247                20200                  1100                   300
## 12248                 6100                  2000                   500
## 12295                14300                  2000                   300
## 12322                17100                  1500                   300
## 12386                 7200                   500                   200
## 12554                 1500                  1500                  1500
## 12568                15350                  1500                   250
## 12689                12600                  1500                   500
## 12707                10300                  1700                   500
## 12848                 6300                  2000                   500
## 12873                23500                  2500                   300
## 12891                12400                  1100                   500
## 12949                 2000                  2000                  2000
## 12986                 4000                  2500                   500
## 13028                17900                  2500                   100
## 13047                 5500                  3000                   500
## 13048                67300                  5100                  2500
## 13054                14600                  1500                   500
## 13119                20000                  2500                  1000
## 13259                 4100                  1500                   500
## 13264                18500                  1200                   300
## 13292                 8000                  3000                  1000
## 13368                 2000                  2000                  2000
## 13401                 8200                   600                   300
## 13417                12600                  2000                   500
## 13462                 3200                  1700                  1500
## 13713                15700                  2600                   300
## 13731                 7000                   500                   100
## 13734                10500                  2500                  1000
## 13970                 3000                  2000                  1000
## 14011                 3600                  1000                   500
## 14022                39100                  5000                   600
## 14041                 5000                  3000                  2000
## 14053                27900                  2500                   300
## 14165                16400                  2500                   500
## 14167                12500                  2000                  1000
## 14239                 2000                  2000                  2000
## 14399                 1800                  1500                   300
## 14505                 8200                   800                   300
## 14519                 2500                  2500                  2500
## 14557                 1500                  1500                  1500
## 14578                 5700                  2100                  1500
## 14921                 6600                  2000                   600
## 15269                29300                  8000                   500
## 15579                 8500                  2000                  1000
## 15675                 4500                  2500                   500
## 15773                 9000                  2500                  2000
## 15827                 5000                  2500                  1000
## 15831                 2000                  2000                  2000
## 15843                 4700                  1500                  1000
## 16025                 4200                  1200                   300
## 16338                 9900                   600                   300
## 16339                 2600                  1600                  1000
## 16342                10900                  1500                   300
## 16345                 2500                  1500                  1000
## 16418                11200                  1500                   500
## 16501                 2500                  2000                   500
## 16676                 2000                  1000                   500
## 16691                20900                  2500                   500
## 16698                 4500                  2500                  2000
## 16772                 2000                  2000                  2000
## 16842                 5300                  1500                   300
## 16981                16100                  1000                   300
## 16982                 7800                  1700                   200
## 17303                21800                  3000                   300
## 17335                20500                  1500                   300
## 17459                12300                  5000                   600
## 17460                 9400                  1500                   500
## 17461                 9800                  1200                   300
## 17462                 4400                  1500                   500
## 17463                10500                  2700                   300
## 17464                 6000                  2000                  1000
## 17465                 8200                  2100                  1000
## 17466                 8200                  3000                   500
## 17467                 6000                  2500                  1500
## 17468                52200                  3500                   300
## 17469                 3300                  1700                   500
## 17520                 1500                  1500                  1500
## 17539                 6700                  2000                   300
## 17540                 6100                   600                   300
## 17596                29900                  2700                  1000
## 17666                20700                  3500                   500
## 17710                 5400                  1200                   500
## 17711                 4700                  1700                  1500
## 17721                13300                  1800                   300
## 17876                 1500                  1500                  1500
## 17880                 5400                  1600                   500
## 17908                 3800                  1500                   500
## 17953                 8600                   700                   300
## 17968                 2000                  2000                  2000
## 17974                 4400                  1000                   500
## 18025                 2500                  2500                  2500
## 18068                 4900                  1500                   200
## 18087                14900                  1500                   500
## 18290                 5000                  2000                  1000
## 18337                 7500                  2500                   500
## 18375                 5100                   600                   300
## 18382                 7500                  1000                   200
## 18491                 3000                  2000                  1000
## 18495                12000                  2500                   500
## 18894                 3200                  1200                  1000
## 18913                 3000                  2000                  1000
## 19040                46600                  5000                   300
## 19159                 6300                   700                   500
## 19306                 2000                  2000                  2000
## 19339                14500                  2500                  1000
## 19344                 3000                  2500                   500
##       LIFETIME_AVG_GIFT_AMT EMAILS_12
## 139                    2583        37
## 352                    1638        37
## 363                     767        36
## 459                    5617        24
## 674                    1360        22
## 862                    1482        22
## 1122                    940        29
## 1123                    792        15
## 1177                   1043        24
## 1243                   1700        33
## 1496                   1636        21
## 1550                    733        17
## 1551                    925        11
## 1552                   2000         9
## 1799                   2500        13
## 2008                   1229        21
## 2208                   2313        19
## 2223                    523        20
## 2230                    969        19
## 2357                    600        21
## 2420                   2000        15
## 2661                    643        19
## 2713                   1243        20
## 2810                   1580        19
## 2811                   1525        19
## 2825                   1564        19
## 2925                   1000        15
## 2951                   2000        14
## 2994                    629        16
## 3176                    943        10
## 3233                   3250        19
## 3286                    633        20
## 3400                    950        17
## 3654                   2000        19
## 3827                    574        18
## 3837                    600        18
## 3877                   1075        20
## 3890                   3300        19
## 4235                    740        20
## 4261                    460        20
## 4271                   1111        18
## 4410                   1492        18
## 4411                    522        16
## 4412                   2500        19
## 4413                    950        16
## 4414                   1175        16
## 4449                   1500        11
## 4556                    675        16
## 4568                    354        19
## 4569                   1060        16
## 4604                    742        19
## 4605                   1250        15
## 4615                   1233        14
## 4766                   1475        19
## 4999                    950        18
## 5126                   1111        15
## 5290                    694        19
## 5484                    917        20
## 5722                    839        13
## 5872                   2000        13
## 5875                    533        19
## 6538                   3000        16
## 7364                    878        17
## 7570                    417        12
## 7802                    970        17
## 8344                    563        18
## 8345                   1333        13
## 8346                   1080        20
## 8348                   2000        14
## 9278                   1929        19
## 9280                   1133        19
## 9281                   2500        16
## 9316                   1220        18
## 9320                    943        19
## 9321                   1158        19
## 9323                    544        15
## 9499                   1067        20
## 10210                  1250        14
## 10224                   480        20
## 10259                   600        16
## 10403                  1500        11
## 10575                  1500        14
## 10720                  1667        19
## 10733                  3800        21
## 10834                  2500        17
## 10857                  1850        16
## 10951                   567        19
## 10994                   825        15
## 11022                   947        19
## 11023                  1200        15
## 11085                  1240        19
## 11153                  1750        16
## 11245                  2500        14
## 11258                   640        18
## 11328                  1500        14
## 11338                  2350        17
## 11386                   337        11
## 11448                  1550        19
## 11522                   981        17
## 11610                  1300        14
## 11618                  5000        16
## 11739                  1500        14
## 11748                  1033        15
## 11758                  1040        19
## 11759                  1019        19
## 12016                  1100        19
## 12028                  2500        17
## 12054                  1940        19
## 12074                   892        17
## 12081                  2000        19
## 12245                  3383        21
## 12247                   748        17
## 12248                  1220        18
## 12295                   894        18
## 12322                   713        19
## 12386                   327        19
## 12554                  1500        15
## 12568                   903        19
## 12689                  1050        18
## 12707                  1030        16
## 12848                  1260        19
## 12873                  1679        17
## 12891                   827        21
## 12949                  2000        14
## 12986                  1333        20
## 13028                  1492        19
## 13047                  1833        20
## 13048                  4206        20
## 13054                   973        19
## 13119                  1818        19
## 13259                  1025        19
## 13264                   544        19
## 13292                  2000        19
## 13368                  2000        14
## 13401                   456        20
## 13417                  1145        21
## 13462                  1600        14
## 13713                  1121        18
## 13731                   333        18
## 13734                  1750        19
## 13970                  1500        14
## 14011                   720        20
## 14022                  2172        20
## 14041                  2500        19
## 14053                  1268        17
## 14165                   965        15
## 14167                  1563        17
## 14239                  2000        16
## 14399                   900        14
## 14505                   547        13
## 14519                  2500        14
## 14557                  1500        11
## 14578                  1900        18
## 14921                  1320        18
## 15269                  4186        23
## 15579                  1417        19
## 15675                  1500        20
## 15773                  2250        21
## 15827                  1667        18
## 15831                  2000        14
## 15843                  1175        14
## 16025                   525        19
## 16338                   450        19
## 16339                  1300        17
## 16342                   908        19
## 16345                  1250        19
## 16418                   747        15
## 16501                  1250        16
## 16676                   667        17
## 16691                  1306        19
## 16698                  2250        18
## 16772                  2000        14
## 16842                   883        19
## 16981                   575        18
## 16982                  1114        17
## 17303                  1363        19
## 17335                  1025        20
## 17459                  2050        20
## 17460                  1175        18
## 17461                   490        19
## 17462                  1100        18
## 17463                  1313        19
## 17464                  1500        18
## 17465                  1640        19
## 17466                  1171        19
## 17467                  2000        20
## 17468                  1450        20
## 17469                  1100        16
## 17520                  1500        14
## 17539                   744        17
## 17540                   436        20
## 17596                  1869        19
## 17666                  2588        19
## 17710                   900        19
## 17711                  1567        20
## 17721                   831        12
## 17876                  1500        14
## 17880                  1080        21
## 17908                   950        16
## 17953                   538        18
## 17968                  2000        16
## 17974                   880        20
## 18025                  2500        16
## 18068                   980        14
## 18087                   931        19
## 18290                  1667        17
## 18337                  1500        19
## 18375                   425        20
## 18382                   625        22
## 18491                  1500        14
## 18495                  1714        13
## 18894                  1067         6
## 18913                  1500         7
## 19040                  2026        44
## 19159                   573         6
## 19306                  2000        16
## 19339                  1450        19
## 19344                  1500        19
new_df <- df

Key findings

There is 0 entries for median home value. This is likely a data entry error because home value cannot be zero

ACTION: Observe for now, deletion may affect the model.

# confirm all changes
table(new_df$MEDIAN_HOME_VALUE[new_df$MEDIAN_HOME_VALUE == 0])
## 
##   0 
## 218
str(new_df)
## 'data.frame':    19372 obs. of  27 variables:
##  $ CONTROL_NUMBER         : int  5 12 37 38 41 52 53 67 70 71 ...
##  $ DONATED                : logi  FALSE TRUE FALSE FALSE FALSE FALSE ...
##  $ DONATION_AMT           : num  0 1000 0 0 0 0 0 500 0 1600 ...
##  $ DONOR_AGE              : int  87 79 75 60 74 63 71 79 41 63 ...
##  $ INCOME_LEVEL           : num  2 7 5 6 2 3 5 1 4 4 ...
##  $ SES                    : num  830 45 11 4 49 8 50 28 30 43 ...
##  $ MEDIAN_HOME_VALUE      : num  554000 334000 2388000 1688000 514000 ...
##  $ MEDIAN_HOUSEHOLD_INCOME: num  294000 212000 405000 153000 328000 182000 122000 189000 180000 307000 ...
##  $ DONATION_RESPONSE      : int  0 0 0 0 8 0 0 0 0 0 ...
##  $ MONTHS_SINCE_LAST_GIFT : int  26 7 6 6 18 19 21 9 17 9 ...
##  $ EMAILS_12              : int  17 28 43 44 25 29 25 33 36 38 ...
##  $ LIFETIME_GIFT_COUNT    : int  35 25 36 37 12 22 20 20 34 22 ...
##  $ LIFETIME_EMAILS        : int  19 32 44 31 30 22 29 33 36 27 ...
##  $ LIFETIME_GIFT_AMOUNT   : num  29700 36800 60300 43500 10600 12800 22000 10100 17100 15000 ...
##  $ LIFETIME_MAX_GIFT_AMT  : num  2000 2500 2800 1700 2500 1000 2000 700 700 1100 ...
##  $ LIFETIME_MIN_GIFT_AMT  : int  500 500 500 300 500 300 500 300 300 300 ...
##  $ LIFETIME_AVG_GIFT_AMT  : int  849 1472 1675 1176 883 582 1100 505 503 682 ...
##  $ URBANICITYR            : num  0 1 0 0 1 0 1 0 0 1 ...
##  $ URBANICITYS            : num  1 0 1 0 0 0 0 1 1 0 ...
##  $ URBANICITYT            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ URBANICITYU            : num  0 0 0 1 0 1 0 0 0 0 ...
##  $ DONOR_GENDERF          : num  0 0 1 1 1 0 0 1 1 1 ...
##  $ DONOR_GENDERM          : num  1 1 0 0 0 1 1 0 0 0 ...
##  $ DONOR_GENDERU          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOME_OWNERH            : num  1 1 1 1 0 0 1 1 1 1 ...
##  $ HOME_OWNERN            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOME_OWNERU            : num  0 0 0 0 1 1 0 0 0 0 ...
# Convert target variable to factor
new_df$DONATED <- as.factor(new_df$DONATED)
# reference category
new_df$DONOR_GENDERU <- NULL
new_df$HOME_OWNERU <- NULL
new_df$URBANICITYU <- NULL

VISUALIZATIONS

# distribution of target variable
ggplot(new_df, aes(x = DONATED)) +
  geom_bar(fill = "steelblue") +
  labs(title="Distribution of donors who donated") +
  theme(axis.text = element_text(angle = 45, vjust = 0.5, hjust = 1))

#### Key findings

A high percentage of donors did not make a donation to the latest campaign. This may affect the model prediction.

# create a correlation plot for the numerical variables
numeric_cols <- new_df %>%
  select(c("DONOR_AGE", 
           "INCOME_LEVEL", 
           "SES",
           "MEDIAN_HOME_VALUE", 
           "MEDIAN_HOUSEHOLD_INCOME",
           "DONATION_RESPONSE",
           "MONTHS_SINCE_LAST_GIFT",
           "LIFETIME_GIFT_COUNT",
           "LIFETIME_GIFT_AMOUNT",
           "LIFETIME_MAX_GIFT_AMT",
           "LIFETIME_MIN_GIFT_AMT",
           "LIFETIME_AVG_GIFT_AMT",
           "EMAILS_12"
           ))
par(mar = c(2,2,2,2)) # expand margins because the plot is small
# visualize the corr plot
corrplot(cor(numeric_cols),
         method = 'color',
         type = 'upper',
         addCoef.col = 'black',
         tl.srt = 45,
         tl.col = "black",
         number.cex = 0.55
         )

#### Key findings high correlation between MEDIAN_HOME_VALUE and MEDIAN_HOUSEHOLD_INCOME - 0.68 is expected as a higher income is likely to live in a high median home value.

high correlation between LIFETIME_GIFT_COUNT and LIFETIME_GIFT_AMOUNT - 0.65 - the more the gifts the more the counts

high correlation between LIFETIME_MAX_GIFT_AMT and LIFETIME_MIN_GIFT_AMT - 0.73

high correlation between LIFETIME_AVG_GIFT_AMT and LIFETIME_MIN_GIFT_AMT - 0.80 - this is very high, could they be measuring the same thing?

# investigate the relation between the target variable and numerical variables
# for the firs 6 numerical variables
# reset the margins
par(mar = c(4,4,2,1)) # the grid is made of 2 rows and 4 columns

par(mfrow = c(2,4)) # multiple figures filled row by row in the set 2 rows bt 4 columns. 
numeric_cols <- new_df %>%
  select(c("DONOR_AGE", 
           "INCOME_LEVEL", 
           "SES",
           "MEDIAN_HOME_VALUE", 
           "MEDIAN_HOUSEHOLD_INCOME",
           "DONATION_RESPONSE"
           ))


for (variable in names(numeric_cols)) {
  boxplot(new_df[[variable]] ~ new_df$DONATED,
          main = variable,
          xlab = "Donated",
          ylab = variable,
          col = c("orange", "steelblue"),
          outline = FALSE) # FALSE to not show the outliers
  }

# investigate the relation between the target variable and numerical variables
# for the second 6 numerical variables
# reset the margins
par(mar = c(4,4,2,1))

par(mfrow = c(2,4))
numeric_cols <- df %>%
  select(c("MONTHS_SINCE_LAST_GIFT",
           "LIFETIME_GIFT_COUNT",
           "LIFETIME_GIFT_AMOUNT",
           "LIFETIME_MAX_GIFT_AMT",
           "LIFETIME_MIN_GIFT_AMT",
           "LIFETIME_AVG_GIFT_AMT",
           "EMAILS_12"
           ))

for (variable in names(numeric_cols)) {
  boxplot(new_df[[variable]] ~ new_df$DONATED,
          main = variable,
          xlab = "Donated",
          ylab = variable,
          col = c("orange", "steelblue"),
          outline = FALSE) # FALSE to not show the outliers
  }

#### Key findings

DONOR_AGE, INCOME_LEVEL, SES are a weak predictor as it does not separate those who donated from those who did not donate.

MEDIAN_HOME_VALUE, MEDIAN_HOUSEHOLD_INCOME and DONATION_RESPONSE have the blue box slightly higher than the orange which means that those who donated are likely to have a higher median value home,

higher median household income and are likely more responsive. These are strong predictors for the model.

Strong predictors:

MONTHS_SINCE_LAST_GIFT shows that who didnt donate have more months since their last gift.

LIFETIME_MAX_GIFT_AMT,LIFETIME_AVG_GIFT_AMT shows a difference with those who failed to donate as higher than those who didn’t.

LIFETIME_GIFT_AMOUNT shows those who have given more gifts were slightly higher than those who didn’t.

EMAILS_12 shows that those who receieved more campaignm emails were more likley to give compared to those who didnt.

Weak predictors:

LIFETIME_MIN_GIFT_AMT shows no difference between donors and non-donors

new_df$DONATION_AMT <- NULL
new_df$CONTROL_NUMBER <- NULL
# last check before splitting the data
str(new_df)
## 'data.frame':    19372 obs. of  22 variables:
##  $ DONATED                : Factor w/ 2 levels "FALSE","TRUE": 1 2 1 1 1 1 1 2 1 2 ...
##  $ DONOR_AGE              : int  87 79 75 60 74 63 71 79 41 63 ...
##  $ INCOME_LEVEL           : num  2 7 5 6 2 3 5 1 4 4 ...
##  $ SES                    : num  830 45 11 4 49 8 50 28 30 43 ...
##  $ MEDIAN_HOME_VALUE      : num  554000 334000 2388000 1688000 514000 ...
##  $ MEDIAN_HOUSEHOLD_INCOME: num  294000 212000 405000 153000 328000 182000 122000 189000 180000 307000 ...
##  $ DONATION_RESPONSE      : int  0 0 0 0 8 0 0 0 0 0 ...
##  $ MONTHS_SINCE_LAST_GIFT : int  26 7 6 6 18 19 21 9 17 9 ...
##  $ EMAILS_12              : int  17 28 43 44 25 29 25 33 36 38 ...
##  $ LIFETIME_GIFT_COUNT    : int  35 25 36 37 12 22 20 20 34 22 ...
##  $ LIFETIME_EMAILS        : int  19 32 44 31 30 22 29 33 36 27 ...
##  $ LIFETIME_GIFT_AMOUNT   : num  29700 36800 60300 43500 10600 12800 22000 10100 17100 15000 ...
##  $ LIFETIME_MAX_GIFT_AMT  : num  2000 2500 2800 1700 2500 1000 2000 700 700 1100 ...
##  $ LIFETIME_MIN_GIFT_AMT  : int  500 500 500 300 500 300 500 300 300 300 ...
##  $ LIFETIME_AVG_GIFT_AMT  : int  849 1472 1675 1176 883 582 1100 505 503 682 ...
##  $ URBANICITYR            : num  0 1 0 0 1 0 1 0 0 1 ...
##  $ URBANICITYS            : num  1 0 1 0 0 0 0 1 1 0 ...
##  $ URBANICITYT            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DONOR_GENDERF          : num  0 0 1 1 1 0 0 1 1 1 ...
##  $ DONOR_GENDERM          : num  1 1 0 0 0 1 1 0 0 0 ...
##  $ HOME_OWNERH            : num  1 1 1 1 0 0 1 1 1 1 ...
##  $ HOME_OWNERN            : num  0 0 0 0 0 0 0 0 0 0 ...

Split the data for logistic regression

set.seed(123)
# create an index
train_index <- createDataPartition(new_df$DONATED, p = 0.8, list = FALSE)
# Split index into train and test
train_data <- new_df[train_index, ]
test_data <- new_df[-train_index, ]
# check the total observations
total_observations <- sum(nrow(train_data) + nrow(test_data))

# verify
nrow(train_data)
## [1] 15499
nrow(test_data)
## [1] 3873
total_observations
## [1] 19372
prop.table(table(train_data$DONATED))
## 
##     FALSE      TRUE 
## 0.7499839 0.2500161
prop.table(table(test_data$DONATED))
## 
##     FALSE      TRUE 
## 0.7500645 0.2499355

Build a logistic model that includes all the features

model <- glm(DONATED~.,family = "binomial", data = train_data)
summary(model)
## 
## Call:
## glm(formula = DONATED ~ ., family = "binomial", data = train_data)
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -7.417e-01  2.064e-01  -3.593 0.000326 ***
## DONOR_AGE                2.360e-03  1.460e-03   1.616 0.106076    
## INCOME_LEVEL             5.372e-02  1.424e-02   3.772 0.000162 ***
## SES                      2.562e-04  1.459e-04   1.756 0.079074 .  
## MEDIAN_HOME_VALUE        1.475e-07  2.713e-08   5.436 5.45e-08 ***
## MEDIAN_HOUSEHOLD_INCOME -8.287e-08  1.636e-07  -0.507 0.612353    
## DONATION_RESPONSE        1.069e-03  1.935e-03   0.552 0.580668    
## MONTHS_SINCE_LAST_GIFT  -4.196e-02  5.529e-03  -7.588 3.24e-14 ***
## EMAILS_12               -5.969e-03  4.179e-03  -1.428 0.153212    
## LIFETIME_GIFT_COUNT      1.661e-02  4.637e-03   3.583 0.000340 ***
## LIFETIME_EMAILS         -4.447e-03  3.893e-03  -1.142 0.253301    
## LIFETIME_GIFT_AMOUNT     3.352e-06  3.812e-06   0.879 0.379213    
## LIFETIME_MAX_GIFT_AMT    9.168e-06  1.857e-05   0.494 0.621580    
## LIFETIME_MIN_GIFT_AMT    3.272e-05  6.281e-05   0.521 0.602401    
## LIFETIME_AVG_GIFT_AMT   -2.649e-04  7.332e-05  -3.613 0.000303 ***
## URBANICITYR              1.400e-01  7.348e-02   1.905 0.056726 .  
## URBANICITYS              1.861e-01  6.397e-02   2.909 0.003629 ** 
## URBANICITYT              2.048e-01  7.215e-02   2.839 0.004526 ** 
## DONOR_GENDERF            8.004e-03  6.782e-02   0.118 0.906050    
## DONOR_GENDERM           -3.556e-02  6.984e-02  -0.509 0.610638    
## HOME_OWNERH              2.347e-02  4.294e-02   0.547 0.584664    
## HOME_OWNERN              5.388e-02  6.862e-02   0.785 0.432306    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 17432  on 15498  degrees of freedom
## Residual deviance: 17079  on 15477  degrees of freedom
## AIC: 17123
## 
## Number of Fisher Scoring iterations: 4
null_deviance <- 17432 
residual_deviance <- 17079
pct_diff <- (null_deviance - residual_deviance) / null_deviance * 100
pct_diff
## [1] 2.025011

Key findings

Based on the model, the following features are significant:

INCOME_LEVEL - those with a higher income level are likely to make a donation

MEDIAN_HOME_VALUE - individuals with a higher median home value likely have a higher median household income and are likely to donate.

MONTHS_SINCE_LAST_GIFT - if its been long since last gift, its likely that there would be no donation.

LIFETIME_GIFT_COUNT - The more gift counts the more likely to make a donation as we saw in the correlation plot.

LIFETIME_AVG_GIFT_AMT - higher less likely to donate becasue im assuming these individuals make big contributions once in a while

URBANICITYS & URBANICITYT - more donors are likley to come from subsurban areas or small towns as compared to Large urba cities and rural areas which are high stress areas.

The null deviance : 1.7432^{4} and the residual deviance 1.7079^{4} give a difference of 2.0250115% which means that the model only learned 2.0250115%

Create a confusion matrix

# evaluate the performance of the model using the validation data set
prediction_prob <- predict(model, newdata = test_data, type = "response")
summary(prediction_prob) # gives min,max,mean of predicted probabilities.
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000206 0.2044861 0.2418318 0.2477663 0.2807676 0.7192983
prediction_class <- ifelse(prediction_prob > 0.3, TRUE, FALSE) # my dataset is unequal with more FALSE than TRUE, hence i lowered the threshold to 0.3 from 0.5

# lets create a confusion matrix
confusion_mtx <- confusionMatrix(
  as.factor(prediction_class),
  as.factor(test_data$DONATED),
  positive = 'TRUE'
)
print(confusion_mtx)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  2485  711
##      TRUE    420  257
##                                           
##                Accuracy : 0.708           
##                  95% CI : (0.6934, 0.7223)
##     No Information Rate : 0.7501          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1344          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.26550         
##             Specificity : 0.85542         
##          Pos Pred Value : 0.37962         
##          Neg Pred Value : 0.77753         
##              Prevalence : 0.24994         
##          Detection Rate : 0.06636         
##    Detection Prevalence : 0.17480         
##       Balanced Accuracy : 0.56046         
##                                           
##        'Positive' Class : TRUE            
## 
# extract the output
accuracy <- confusion_mtx$overall ['Accuracy']
true_pos_rate <- confusion_mtx$byClass ['Sensitivity']
true_neg_rate <- confusion_mtx$byClass ['Specificity']

mean_TRUE_pred <- round(0.2477663, 2) * 100

key findings

The model’s results are Accuracy : 71% with a TPR: 27% and a TNR: 86%.

The model captured 27% of actual donors and 86% of actual non-donors. However the data consisted of higher FALSE - non-donors than non-donors so the model didnt learn much hence a weak prediction.

The question is why the model missed 708 actual donors because the goal for the compaign is to get more donors which means more revenue.

In this model, the average of the positively or TRUE DONOR predictions is 25 which means that the model was not well trained.

Based on all the above, i conclude that the model, at a sensitivity of 27%, means that it failed to capture any new donors hence predicted FALSE for the majority of

individuals.This may likely be due to class imbalance or low predictive power in the features used in the model.

PLAN: improve the model

  • Based on the main problem which is class imbalance (the non-donors/FALSE are way higher than donors/TRUE), the best solution would be to find a way to balnce the training data.

-GOAL: to add more TRUE samples that the model can learn from but realistically where the non-donors will always outnumber the donors.

Logistic regression model on multiple random subsets of the data.

# Train the logistic regression model on multiple (different but same size) random subsets of the data to reduce variance and bias.
num_models <- 10
subset_size <- nrow(train_data) # multiple (different but same size) random subsets of the data
models<- list()

for (i in 1:num_models) {
  sample_indices <- sample(1:nrow(train_data),
                           size = subset_size,
                           replace = TRUE)
  subset <- train_data[sample_indices, ]
  weight_ratio <- sum(as.character(subset$DONATED) == "FALSE") / sum(as.character(subset$DONATED) == "TRUE")
  # train logistic regression with class weights
  models[[i]] <- glm(DONATED~.,
                     data = subset, 
                     family = 'binomial', 
                     weights = ifelse(subset$DONATED == "TRUE", weight_ratio, 1)
                     )
}
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

Key findings

10 models were created.

# obtain all predictions from the ten models
ten_predictions <- sapply(models, function (m){
  predict(m, newdata = test_data, type = 'response')
})
# get the average of all ten probabilities
ensemble_average <- rowMeans(ten_predictions)

# used a threshold of 0.3 because we want the model to capture more TRUE in a imbalanced dataset
ensemble_class <- ifelse(ensemble_average > 0.3, TRUE, FALSE)
ensemble_class <- as.factor(ensemble_class)

Expecting matrix of ten columns for each model and rows (number of test_data rows * 10 for the ten models)

# create a confusion matrix for the ensemble class
confusion_mtx_ensemble <- confusionMatrix(ensemble_class, test_data$DONATED, positive = "TRUE")
print(confusion_mtx_ensemble)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE    24   11
##      TRUE   2881  957
##                                           
##                Accuracy : 0.2533          
##                  95% CI : (0.2397, 0.2673)
##     No Information Rate : 0.7501          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -0.0016         
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.988636        
##             Specificity : 0.008262        
##          Pos Pred Value : 0.249349        
##          Neg Pred Value : 0.685714        
##              Prevalence : 0.249935        
##          Detection Rate : 0.247095        
##    Detection Prevalence : 0.990963        
##       Balanced Accuracy : 0.498449        
##                                           
##        'Positive' Class : TRUE            
## 
# extract the output
accuracy <- confusion_mtx_ensemble$overall ['Accuracy']
true_pos_rate <- confusion_mtx_ensemble$byClass ['Sensitivity']
true_neg_rate <- confusion_mtx_ensemble$byClass ['Specificity']

Key findings

The model’s results are Accuracy : 25% with a TPR: 99% and a TNR: 1%.

The results suggest that the model became more aggressive in capturing the TRUE cases at 958 and completely ignored the FALSE cases at 2880. Hence poor performnace.

The data is imbalanced, lets see if we can address this using the F1 score.

Calculation of F1-SCORE

# lets calculate the f1 score
# 2 * ((precision * recall) / (precision + recall))
TP <- 958
FP <- 2880
FN <- 10

precision <- TP / (TP + FP)
precision
## [1] 0.2496092
recall <- TP / (TP + FN)
recall
## [1] 0.9896694
f1_score <- 2 * (
  (precision * recall) / (precision + recall)
)
f1_score
## [1] 0.3986683

Observations

With a recall of 0.99% and a precision of 0.25%, we get an f1 score of 0.399 which means that the model is willing to capture TRUE’s 99% of the time but it is wrong 25% of the time.

ACTION: Try a higher threshold?

Increased threshold to 0.5

# increase the threshold to 0.5
ensemble_class_05 <- ifelse(ensemble_average > 0.5, TRUE, FALSE)
ensemble_class_05 <- as.factor(ensemble_class_05)

# confusion matrix with a threshold of 0.5
confusion_mtx_ensemble_05 <- confusionMatrix(ensemble_class_05, test_data$DONATED, positive = "TRUE")
print(confusion_mtx_ensemble_05)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  1776  412
##      TRUE   1129  556
##                                           
##                Accuracy : 0.6021          
##                  95% CI : (0.5865, 0.6176)
##     No Information Rate : 0.7501          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.149           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.5744          
##             Specificity : 0.6114          
##          Pos Pred Value : 0.3300          
##          Neg Pred Value : 0.8117          
##              Prevalence : 0.2499          
##          Detection Rate : 0.1436          
##    Detection Prevalence : 0.4351          
##       Balanced Accuracy : 0.5929          
##                                           
##        'Positive' Class : TRUE            
## 
### Calculation of F1 SCORE for both 0.5  threshold
# 2 * ((precision * recall) / (precision + recall))
TP_05<- 554
FP_05 <- 1138
FN_05 <- 414

precision_05 <- TP_05 / (TP_05 + FP_05)
precision_05
## [1] 0.3274232
recall_05 <- TP_05 / (TP_05 + FN_05)
recall_05
## [1] 0.572314
f1_score_05 <- 2 * (
  (precision_05 * recall_05) / (precision_05 + recall_05)
)
f1_score_05
## [1] 0.4165414

Increased threshold to 0.7

# threshold 0.7
# increase the threshold to 0.5
ensemble_class_07 <- ifelse(ensemble_average > 0.7, TRUE, FALSE)
ensemble_class_07 <- as.factor(ensemble_class_07)

# confusion matrix with a threshold of 0.5
confusion_mtx_ensemble_07 <- confusionMatrix(ensemble_class_07, test_data$DONATED, positive = "TRUE")
confusion_mtx_ensemble_07
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  2873  946
##      TRUE     32   22
##                                           
##                Accuracy : 0.7475          
##                  95% CI : (0.7335, 0.7611)
##     No Information Rate : 0.7501          
##     P-Value [Acc > NIR] : 0.6526          
##                                           
##                   Kappa : 0.0171          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.02273         
##             Specificity : 0.98898         
##          Pos Pred Value : 0.40741         
##          Neg Pred Value : 0.75229         
##              Prevalence : 0.24994         
##          Detection Rate : 0.00568         
##    Detection Prevalence : 0.01394         
##       Balanced Accuracy : 0.50586         
##                                           
##        'Positive' Class : TRUE            
## 
### Calculation of F1 SCORE for  0.7 threshold
# 2 * ((precision * recall) / (precision + recall))
TP_07<- 23
FP_07 <- 33
FN_07 <- 945

precision_07 <- TP_07 / (TP_07 + FP_07)
precision_07
## [1] 0.4107143
recall_07 <- TP_07 / (TP_07 + FN_07)
recall_07
## [1] 0.02376033
f1_score_07 <- 2 * (
  (precision_07 * recall_07) / (precision_07 + recall_07)
)
f1_score_07
## [1] 0.04492188

Key findings

With a threshold of 0.3, the F1 SCORE is very low so i increased the threshold to 0.5 and 0.7 and calculated them separately.

With a threshold of 0.5, the recall is 0.572 and precision of 0.327, we get an F1 SCORE of 0.417

With a threshold of 0.7, the recall is 0.024 and precision of 0.411, we get an F1 SCORE of 0.045

0.3 threshold has poor precision and very high recall.

0.7 has good precision but very poor recall.

Considering we have imbalanced data, 0.5 is the best threshold with an F1 SCORE of 0.417

Build random forest model

set.seed(123)

# Build the model
rf_model <- randomForest(
  DONATED~.,data = train_data,
  ntree = 100,
  classwt = c("FALSE" = 1, "TRUE"= 3), # this is because i want the model to capture more TRUE cases compared to FALSE cases.
  importance = TRUE
)

# evaluate with test data
rf_predictions <- predict(rf_model ,newdata = test_data)

# create a cofusion matrix for the random forest model
rf_confusion_matrix <- confusionMatrix(rf_predictions, test_data$DONATED, positive = 'TRUE')
rf_confusion_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  2893  957
##      TRUE     12   11
##                                           
##                Accuracy : 0.7498          
##                  95% CI : (0.7359, 0.7634)
##     No Information Rate : 0.7501          
##     P-Value [Acc > NIR] : 0.5234          
##                                           
##                   Kappa : 0.0107          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.011364        
##             Specificity : 0.995869        
##          Pos Pred Value : 0.478261        
##          Neg Pred Value : 0.751429        
##              Prevalence : 0.249935        
##          Detection Rate : 0.002840        
##    Detection Prevalence : 0.005939        
##       Balanced Accuracy : 0.503616        
##                                           
##        'Positive' Class : TRUE            
## 
# extract the output
accuracy <- rf_confusion_matrix$overall ['Accuracy']
true_pos_rate <- rf_confusion_matrix$byClass ['Sensitivity']
true_neg_rate <- rf_confusion_matrix$byClass ['Specificity']
plot(rf_model)

Key findings

The model has performed poorly with Accuracy : 75%, a TPR: 1% and a TNR: 100%.

The sensitivity has dropped meaning that the model can barely capture any TRUE donor cases.

ACTION: Modify the weight and ntrees.

set.seed(123)

# Build rf model II with increased weight for TRUE cases and increased ntrees to 500
rf_model_2 <- randomForest(
  DONATED~.,data = train_data,
  ntree = 500, # increased the number of trees to 500
  classwt = c("FALSE" = 1, "TRUE"= 5), # this is because i want the model to capture more TRUE cases compared to FALSE cases.
  importance = TRUE
)

# evaluate with test data
rf_predictions_2 <- predict(rf_model_2 ,newdata = test_data)

# create a cofusion matrix for the random forest model
rf_confusion_matrix_2 <- confusionMatrix(rf_predictions_2, test_data$DONATED, positive = 'TRUE')
rf_confusion_matrix_2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  2895  958
##      TRUE     10   10
##                                           
##                Accuracy : 0.7501          
##                  95% CI : (0.7361, 0.7636)
##     No Information Rate : 0.7501          
##     P-Value [Acc > NIR] : 0.5086          
##                                           
##                   Kappa : 0.0102          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.010331        
##             Specificity : 0.996558        
##          Pos Pred Value : 0.500000        
##          Neg Pred Value : 0.751363        
##              Prevalence : 0.249935        
##          Detection Rate : 0.002582        
##    Detection Prevalence : 0.005164        
##       Balanced Accuracy : 0.503444        
##                                           
##        'Positive' Class : TRUE            
## 
# extract the output
accuracy <- rf_confusion_matrix_2$overall ['Accuracy']
true_pos_rate <- rf_confusion_matrix_2$byClass ['Sensitivity']
true_neg_rate <- rf_confusion_matrix_2$byClass ['Specificity']
plot(rf_model_2)

Key findings

The weight was increased for TRUE cases to 5 and FALSE cases maintained at 1. This is telling the model to put an emphasis on capturing TRUE cases more than FALSE.

Results show Accuracy : 75% with a TPR: 1% and a TNR: 100%.

The model does not learn anything and captures FALSE cases only missing 958 potential TRUE donors.

ACTION: Read about sampsize function that can be used instead of weights especially for an imbalanced data set.

For my case, sampsize would mean that each decision tree is forced to train on the a specific number of FALSE and TRUE cases.

There are 11624 FALSE cases and 3875 TRUE cases in training. so we can start with a baseline of 3875 FALSE and 3875 TRUE.

# Build rf model III 
rf_model_3 <- randomForest(
  DONATED~., 
  data = train_data,
  ntree = 500,
  sampsize = c("FALSE" = 3875, "TRUE" = 3875),
  importance = TRUE
)
rf_predictions_3 <- predict(rf_model_3, newdata = test_data, type = "prob")

# convert to class labels using a threshold of 0.5
rf_class_3 <- ifelse(rf_predictions_3[,"TRUE"] > 0.5, TRUE, FALSE)
rf_class_3 <- as.factor(rf_class_3)

# create a confusion matrix
rf_confusion_matrix_3 <- confusionMatrix(rf_class_3, test_data$DONATED, positive = "TRUE")
rf_confusion_matrix_3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  2407  687
##      TRUE    498  281
##                                           
##                Accuracy : 0.694           
##                  95% CI : (0.6793, 0.7085)
##     No Information Rate : 0.7501          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1271          
##                                           
##  Mcnemar's Test P-Value : 4.726e-08       
##                                           
##             Sensitivity : 0.29029         
##             Specificity : 0.82857         
##          Pos Pred Value : 0.36072         
##          Neg Pred Value : 0.77796         
##              Prevalence : 0.24994         
##          Detection Rate : 0.07255         
##    Detection Prevalence : 0.20114         
##       Balanced Accuracy : 0.55943         
##                                           
##        'Positive' Class : TRUE            
## 
plot(rf_model_3)

# calculate F1 SCORE for rf model 3 with 0.5 as the threshold
TP_rf<- 292
FP_rf <- 503
FN_rf <- 676

precision_rf <- TP_rf / (TP_rf + FP_rf)
precision_rf
## [1] 0.3672956
recall_rf <- TP_rf / (TP_rf + FN_rf)
recall_rf
## [1] 0.3016529
f1_score_rf <- 2 * (
  (precision_rf * recall_rf) / (precision_rf + recall_rf)
)
f1_score_rf
## [1] 0.3312535

Key findings

The F1 SCORE for the ensemble random forest with ntrees at 500 and using sample size of 3875 for both TRUE cases and FALSE cases, is 0.331 , which is lower than the homogenous logistic regression that had an F1 SCORE of 0.417.

With a recall of 0.302 and precision of 0.367, means that the random forest model is able to capture TRUE cases at 0.302 but misses out on potential TRUE donors at 0.367 .

Build a heterogenous ensemble model

# we are using ensemble to combine ensemble_class_05 and rf_model_3
# 1. obtain probabilities for each model
logistic_reg_prob <- ensemble_average
random_fst_prob <- rf_predictions_3[,"TRUE"]

# 2. Assign weights based on performance. The F1 SCORE tells which model performed better
wght_logistic_reg <- f1_score_05 / (f1_score_05 + f1_score_rf)
wght_random_fst <- f1_score_rf / (f1_score_rf +  f1_score_05)

# 3. combine the weighted average
final_prob <- (logistic_reg_prob * wght_logistic_reg) + (random_fst_prob * wght_random_fst)
#final_prob

# 4. Apply threshold
final_class <- ifelse(final_prob > 0.5, TRUE, FALSE)
final_class <- as.factor(final_class)

# 5. confurion matrix
het_confusion_matrix <- confusionMatrix(final_class,test_data$DONATED, positive = "TRUE")
het_confusion_matrix
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE  2130  554
##      TRUE    775  414
##                                           
##                Accuracy : 0.6569          
##                  95% CI : (0.6417, 0.6718)
##     No Information Rate : 0.7501          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.1495          
##                                           
##  Mcnemar's Test P-Value : 1.592e-09       
##                                           
##             Sensitivity : 0.4277          
##             Specificity : 0.7332          
##          Pos Pred Value : 0.3482          
##          Neg Pred Value : 0.7936          
##              Prevalence : 0.2499          
##          Detection Rate : 0.1069          
##    Detection Prevalence : 0.3070          
##       Balanced Accuracy : 0.5805          
##                                           
##        'Positive' Class : TRUE            
## 

Calculate the F1 SCORE

# F1 SCORE for the heterogenous model
TP_het<- 422
FP_het <- 742
FN_het <- 546

precision_het <- TP_het / (TP_het + FP_het)
precision_het
## [1] 0.362543
recall_het <- TP_het / (TP_het + FN_het)
recall_het
## [1] 0.4359504
f1_score_het <- 2 * (
  (precision_het * recall_het) / (precision_het + recall_het)
)
f1_score_het
## [1] 0.3958724

Key findings

The heterogenous model was a combination of ensemble_class_05(LR) and rf_model_3(RF). It produced an F1 SCORE: 0.396 with a recall: 0.436 and

precision: 0.363.

It did not outperform the homogenous logistic regression model that had an F1 SCORE : 0.417.

However the random forest model was weaker at an F1 SCORE: 0.331 compared to the homogenous logistic regression at: 0.417.

It could have weakened the overall performance of the heterogenous model.

save my work

save.image(file = "MuchukiM.DA5030.Practicum2-PartA.RData")