Load required libraries

library('caret')
## Loading required package: ggplot2
## Loading required package: lattice
library('dplyr')
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library('ggplot2')
library('corrplot')
## corrplot 0.95 loaded
library('repr')

Load the data

zip_url <- "https://s3.us-east-2.amazonaws.com/artificium.us/datasets/donations-salted.csv.zip"
zip_file <- tempfile(fileext = ".zip")
download.file(zip_url, zip_file, mode = 'wb')
df <- read.csv(unz(zip_file, "donations-salted.csv"), header = TRUE, sep = ",")
head(df)

Data inspection

str(df)
## 'data.frame':    19372 obs. of  20 variables:
##  $ CONTROL_NUMBER         : int  5 12 37 38 41 52 53 67 70 71 ...
##  $ DONATED                : logi  FALSE TRUE FALSE FALSE FALSE FALSE ...
##  $ DONATION_AMT           : int  NA 1000 NA NA NA NA NA 500 NA 1600 ...
##  $ DONOR_AGE              : int  87 79 75 NA 74 63 71 79 41 63 ...
##  $ URBANICITY             : chr  "?" "R" "S" "U" ...
##  $ DONOR_GENDER           : chr  "M" "M" "F" "F" ...
##  $ HOME_OWNER             : chr  "H" "H" "H" "H" ...
##  $ INCOME_LEVEL           : int  2 7 5 6 2 3 5 1 4 4 ...
##  $ SES                    : chr  "." "45" "11" "4" ...
##  $ MEDIAN_HOME_VALUE      : num  554000 334000 2388000 1688000 514000 ...
##  $ MEDIAN_HOUSEHOLD_INCOME: num  294000 212000 405000 153000 328000 182000 122000 189000 180000 307000 ...
##  $ DONATION_RESPONSE      : int  0 0 0 0 8 0 0 0 0 0 ...
##  $ MONTHS_SINCE_LAST_GIFT : int  26 7 6 6 18 19 21 9 17 9 ...
##  $ EMAILS_12              : int  17 28 43 44 25 29 25 33 36 38 ...
##  $ LIFETIME_GIFT_COUNT    : int  35 25 36 37 12 22 20 20 34 22 ...
##  $ LIFETIME_EMAILS        : int  19 32 44 31 30 22 29 33 36 27 ...
##  $ LIFETIME_GIFT_AMOUNT   : num  29700 36800 60300 43500 10600 12800 22000 10100 17100 15000 ...
##  $ LIFETIME_MAX_GIFT_AMT  : num  2000 2500 2800 1700 2500 1000 2000 700 700 1100 ...
##  $ LIFETIME_MIN_GIFT_AMT  : int  500 500 500 300 500 300 500 300 300 300 ...
##  $ LIFETIME_AVG_GIFT_AMT  : int  849 1472 1675 1176 883 582 1100 505 503 682 ...
summary(df)
##  CONTROL_NUMBER    DONATED         DONATION_AMT     DONOR_AGE    
##  Min.   :     5   Mode :logical   Min.   :  100   Min.   : 0.00  
##  1st Qu.: 48289   FALSE:14529     1st Qu.: 1000   1st Qu.:47.00  
##  Median : 96937   TRUE :4843      Median : 1300   Median :60.00  
##  Mean   : 96546                   Mean   : 1562   Mean   :58.92  
##  3rd Qu.:145430                   3rd Qu.: 2000   3rd Qu.:73.00  
##  Max.   :191779                   Max.   :20000   Max.   :87.00  
##                                   NA's   :14529   NA's   :4795   
##   URBANICITY        DONOR_GENDER        HOME_OWNER         INCOME_LEVEL  
##  Length:19372       Length:19372       Length:19372       Min.   :1.000  
##  Class :character   Class :character   Class :character   1st Qu.:2.000  
##  Mode  :character   Mode  :character   Mode  :character   Median :4.000  
##                                                           Mean   :3.908  
##                                                           3rd Qu.:5.000  
##                                                           Max.   :7.000  
##                                                           NA's   :4392   
##      SES            MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME DONATION_RESPONSE
##  Length:19372       Min.   :      0   Min.   :      0         Min.   :  0.000  
##  Class :character   1st Qu.: 518000   1st Qu.: 232000         1st Qu.:  0.000  
##  Mode  :character   Median : 747000   Median : 311000         Median :  0.000  
##                     Mean   :1079872   Mean   : 341970         Mean   :  3.362  
##                     3rd Qu.:1227000   3rd Qu.: 417000         3rd Qu.:  3.000  
##                     Max.   :6000000   Max.   :1500000         Max.   :241.000  
##                                                                                
##  MONTHS_SINCE_LAST_GIFT   EMAILS_12     LIFETIME_GIFT_COUNT LIFETIME_EMAILS
##  Min.   : 4.00          Min.   : 2.00   Min.   : 1.00       Min.   : 2.00  
##  1st Qu.:16.00          1st Qu.:16.00   1st Qu.: 4.00       1st Qu.:11.00  
##  Median :18.00          Median :18.00   Median : 8.00       Median :18.00  
##  Mean   :18.19          Mean   :18.27   Mean   : 9.98       Mean   :18.67  
##  3rd Qu.:20.00          3rd Qu.:19.00   3rd Qu.:14.00       3rd Qu.:26.00  
##  Max.   :27.00          Max.   :77.00   Max.   :95.00       Max.   :56.00  
##                                                                            
##  LIFETIME_GIFT_AMOUNT LIFETIME_MAX_GIFT_AMT LIFETIME_MIN_GIFT_AMT
##  Min.   :  1500       Min.   :   500        Min.   :    0.0      
##  1st Qu.:  4200       1st Qu.:  1200        1st Qu.:  300.0      
##  Median :  7900       Median :  1600        Median :  500.0      
##  Mean   : 10443       Mean   :  1921        Mean   :  762.1      
##  3rd Qu.: 13200       3rd Qu.:  2100        3rd Qu.: 1000.0      
##  Max.   :377500       Max.   :100000        Max.   :45000.0      
##                                                                  
##  LIFETIME_AVG_GIFT_AMT
##  Min.   :  136        
##  1st Qu.:  800        
##  Median : 1120        
##  Mean   : 1286        
##  3rd Qu.: 1500        
##  Max.   :45000        
## 

Data exploration

Check for missing values

# check for NA values
colSums(is.na(df))
##          CONTROL_NUMBER                 DONATED            DONATION_AMT 
##                       0                       0                   14529 
##               DONOR_AGE              URBANICITY            DONOR_GENDER 
##                    4795                       0                       0 
##              HOME_OWNER            INCOME_LEVEL                     SES 
##                       0                    4392                       0 
##       MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME       DONATION_RESPONSE 
##                       0                       0                       0 
##  MONTHS_SINCE_LAST_GIFT               EMAILS_12     LIFETIME_GIFT_COUNT 
##                       0                       0                       0 
##         LIFETIME_EMAILS    LIFETIME_GIFT_AMOUNT   LIFETIME_MAX_GIFT_AMT 
##                       0                       0                       0 
##   LIFETIME_MIN_GIFT_AMT   LIFETIME_AVG_GIFT_AMT 
##                       0                       0
sum(is.na(df))
## [1] 23716
# check for empty strings
any(apply(df, 2, function(x) any(x == ""| x == " ")))
## [1] NA
# check for unknown entries in the data frame
colSums(df == "uknown"|df == "Uknown")
##          CONTROL_NUMBER                 DONATED            DONATION_AMT 
##                       0                       0                      NA 
##               DONOR_AGE              URBANICITY            DONOR_GENDER 
##                      NA                       0                       0 
##              HOME_OWNER            INCOME_LEVEL                     SES 
##                       0                      NA                       0 
##       MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME       DONATION_RESPONSE 
##                       0                       0                       0 
##  MONTHS_SINCE_LAST_GIFT               EMAILS_12     LIFETIME_GIFT_COUNT 
##                       0                       0                       0 
##         LIFETIME_EMAILS    LIFETIME_GIFT_AMOUNT   LIFETIME_MAX_GIFT_AMT 
##                       0                       0                       0 
##   LIFETIME_MIN_GIFT_AMT   LIFETIME_AVG_GIFT_AMT 
##                       0                       0

Key findings

Lets investigate columns : INCOME_LEVEL , DONOR_AGE

NOTE: the target variable has NA values.

There are no empty strings in the data frame.

There are no “Unknown” or “unknown” entries in the data frame. Yay!

# DONOR_AGE - the age of donors in years
unique(df$DONOR_AGE)
##  [1] 87 79 75 NA 74 63 71 41 73 81 76 69 45 43 55 77 37 52 30 67 65 59 42 58 56
## [26] 80 82 64 48 85 47 70 40 32 53 86 54 33 60 49  7 17 57 78 50 39 66 51 72 38
## [51] 61 68 24 16 84 44 34 83 35 29 26 27 46 31 62 23 36 18  6 25 28 19 21 22  0
## [76]  2 12 20 15  4 14
sum(is.na(df$DONOR_AGE))
## [1] 4795
# lets look at the rows with NA values
df[is.na(df$DONOR_AGE), ]
table(df$DONOR_AGE)
## 
##   0   2   4   6   7  12  14  15  16  17  18  19  20  21  22  23  24  25  26  27 
##   2   3   1  10  74   1   1   2  11 101   2   5   1  13   9  23  10  46  35 124 
##  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47 
##  37  79  55 111  65 137  80 202 117 282 112 244 160 261 179 278 170 343 178 358 
##  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67 
## 212 404 237 364 177 321 212 361 181 365 186 320 169 264 208 271 178 347 228 411 
##  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87 
## 231 343 225 350 239 395 295 380 281 381 251 326 215 276 192 239 140 174 144 162
summary(df$DONOR_AGE)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   47.00   60.00   58.92   73.00   87.00    4795
# sum of donors who are 18 years and below
sum(df$DONOR_AGE < 18, na.rm = TRUE) 
## [1] 206
# lets use a threshold of 18 and equate all donors <18 to NA for age
df$DONOR_AGE[df$DONOR_AGE <18] <- NA 

# impute the NA with the median age
median_age <- median(df$DONOR_AGE, na.rm = TRUE)
df$DONOR_AGE[is.na(df$DONOR_AGE)] <- median_age

# verify the change
sum(is.na(df$DONOR_AGE))
## [1] 0
table(df$DONOR_AGE)
## 
##   18   19   20   21   22   23   24   25   26   27   28   29   30   31   32   33 
##    2    5    1   13    9   23   10   46   35  124   37   79   55  111   65  137 
##   34   35   36   37   38   39   40   41   42   43   44   45   46   47   48   49 
##   80  202  117  282  112  244  160  261  179  278  170  343  178  358  212  404 
##   50   51   52   53   54   55   56   57   58   59   60   61   62   63   64   65 
##  237  364  177  321  212  361  181  365  186  320 5170  264  208  271  178  347 
##   66   67   68   69   70   71   72   73   74   75   76   77   78   79   80   81 
##  228  411  231  343  225  350  239  395  295  380  281  381  251  326  215  276 
##   82   83   84   85   86   87 
##  192  239  140  174  144  162
sum(df$DONOR_AGE < 18,na.rm = TRUE)
## [1] 0
colSums(is.na(df))
##          CONTROL_NUMBER                 DONATED            DONATION_AMT 
##                       0                       0                   14529 
##               DONOR_AGE              URBANICITY            DONOR_GENDER 
##                       0                       0                       0 
##              HOME_OWNER            INCOME_LEVEL                     SES 
##                       0                    4392                       0 
##       MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME       DONATION_RESPONSE 
##                       0                       0                       0 
##  MONTHS_SINCE_LAST_GIFT               EMAILS_12     LIFETIME_GIFT_COUNT 
##                       0                       0                       0 
##         LIFETIME_EMAILS    LIFETIME_GIFT_AMOUNT   LIFETIME_MAX_GIFT_AMT 
##                       0                       0                       0 
##   LIFETIME_MIN_GIFT_AMT   LIFETIME_AVG_GIFT_AMT 
##                       0                       0

Key findings

There are 4795 NA values in DONOR_AGE

There is a donor with 0 age which is likely a data entry error.

ACTION: I set a threshold of 18 and all DONOR-AGE below 18 were set to NA then imputed all NA values with the median age.

# INCOME_LEVEL: one of 7 income levels; higher means more income; exact bracket structure is unknown
unique(df$INCOME_LEVEL)
## [1]  2  7  5  6  3  1  4 NA
table(df$INCOME_LEVEL)
## 
##    1    2    3    4    5    6    7 
## 1822 2626 1699 2526 3149 1591 1567
sum(is.na(df$INCOME_LEVEL))
## [1] 4392
# check the rows that have na INCOME_LEVEL
df[is.na(df$INCOME_LEVEL), ]
# check what levsls 1- 7 mean using the donation amount col
tapply(df$DONATION_AMT, df$INCOME_LEVEL, mean, na.rm = TRUE)
##        1        2        3        4        5        6        7 
## 1342.287 1378.962 1434.752 1528.672 1650.825 1746.520 1769.484
# imput the the NA values with the median
median_income <- median(df$INCOME_LEVEL, na.rm = TRUE)
df$INCOME_LEVEL[is.na(df$INCOME_LEVEL)] <- median_income
# verify the change
sum(is.na(df$INCOME_LEVEL))
## [1] 0
colSums(is.na(df))
##          CONTROL_NUMBER                 DONATED            DONATION_AMT 
##                       0                       0                   14529 
##               DONOR_AGE              URBANICITY            DONOR_GENDER 
##                       0                       0                       0 
##              HOME_OWNER            INCOME_LEVEL                     SES 
##                       0                       0                       0 
##       MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME       DONATION_RESPONSE 
##                       0                       0                       0 
##  MONTHS_SINCE_LAST_GIFT               EMAILS_12     LIFETIME_GIFT_COUNT 
##                       0                       0                       0 
##         LIFETIME_EMAILS    LIFETIME_GIFT_AMOUNT   LIFETIME_MAX_GIFT_AMT 
##                       0                       0                       0 
##   LIFETIME_MIN_GIFT_AMT   LIFETIME_AVG_GIFT_AMT 
##                       0                       0

Key findings

There are 7 levels of income.

In the INCOME_LEVEL column there are 0 NA values

The rows with NA INCOME_LEVEL entries have no specific patttern.

For the seven income levels; 1 is the least with 277.0033 and 7 is the highest with 481.0466.

ACTION TAKEN:

Impute the NA values with the median of the INCOME_LEVEl column.

The INCOME_LEVEL column is ordinal and the differences between the levels of income differ hence the median would be the best to imput the NA values.

# plot the INCOME_LEVEL
ggplot(df, aes(x = INCOME_LEVEL)) +
  geom_bar(fill = "steelblue") +
  labs(title="Distribution of the levels of income") +
  theme(axis.text = element_text(angle = 45, vjust = 0.5, hjust = 1))

#### Observation

There is an increase in income level 4 after imputing NA values with the median income.

# SES: socio-economic status for donor cluster; higher number means a higher socio-economic status
unique(df$SES)
##  [1] "."  "45" "11" "4"  "49" "8"  "50" "28" "30" "43" "53" "42" "46" "20" "16"
## [16] "40" "7"  "34" "23" "35" "41" "25" "10" "1"  "9"  "2"  "12" "14" "37" "36"
## [31] "15" "39" "38" "18" "48" "24" "3"  "13" "31" "5"  "27" "19" "51" "22" "17"
## [46] "26" "21" "44" "6"  "29" "33" "47" "32" "52"
table(unique(df$SES))
## 
##  .  1 10 11 12 13 14 15 16 17 18 19  2 20 21 22 23 24 25 26 27 28 29  3 30 31 
##  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
## 32 33 34 35 36 37 38 39  4 40 41 42 43 44 45 46 47 48 49  5 50 51 52 53  6  7 
##  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1  1 
##  8  9 
##  1  1
# compare SES col with DONATION AMT
tapply(df$DONATION_AMT, df$SES, mean, na.rm = TRUE)
##        .        1       10       11       12       13       14       15 
## 2043.007 2074.627 1556.522 1684.252 1638.125 2033.918 1675.984 1827.885 
##       16       17       18       19        2       20       21       22 
## 1351.923 1533.721 1549.702 1320.000 1565.152 1519.355 1836.824 1651.538 
##       23       24       25       26       27       28       29        3 
## 1523.239 1571.065 1432.812 1337.255 1519.880 1533.028 1920.000 1852.059 
##       30       31       32       33       34       35       36       37 
## 1209.174 1562.903 1507.407 1211.538 1632.353 1702.475 1357.838 1560.465 
##       38       39        4       40       41       42       43       44 
## 1450.769 1504.839 1500.000 1462.979 1420.833 1618.589 1388.462 1554.730 
##       45       46       47       48       49        5       50       51 
## 1318.019 1443.269 1241.176 1293.750 1465.541 1612.766 1662.857 1269.307 
##       52       53        6        7        8        9 
## 1076.923 1445.455 1727.273 1813.636 1495.000 1326.471
# find the mode of the SES column
mode_SES <- as.numeric(sort(table(df$SES), decreasing = TRUE)[1])
df$SES[df$SES == "."] <- mode_SES
# verify the changes
table(unique(df$SES))
## 
##   1  10  11  12  13  14  15  16  17  18  19   2  20  21  22  23  24  25  26  27 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  28  29   3  30  31  32  33  34  35  36  37  38  39   4  40  41  42  43  44  45 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1   1 
##  46  47  48  49   5  50  51  52  53   6   7   8 830   9 
##   1   1   1   1   1   1   1   1   1   1   1   1   1   1
df$SES <- as.numeric(df$SES)

Key findings

The SES ranges from 1 to 830 -> Wide range!

There is no specific pattern between SES and DONATION_AMT

There is 1 “.” - This is not an appropriate data entry for the model.

# URBANICITY: the “urbanicity” of the donor’s place of residence: 
# U = urban (i.e., lives in a (large) city), 
# S = suburban (outside but near the city), 
# T = town (lives in a “small” town), 
# R = rural (e.g., farm, away from town) , 
# ? = unknown
unique(df$URBANICITY)
## [1] "?" "R" "S" "U" "C" "T"
table(df$URBANICITY, useNA = "always")
## 
##    ?    C    R    S    T    U <NA> 
##  454 4022 4005 4491 3944 2456    0
sum(df$URBANICITY == "?")
## [1] 454
sum(df$URBANICITY == "C")
## [1] 4022
# The metadata does not have "C"
# I will combine the '?' and 'C' as NA values 
df$URBANICITY[df$URBANICITY == "C"] <- NA
df$URBANICITY[df$URBANICITY == "?"] <- NA
# verify the change
sum(df$URBANICITY == "?")
## [1] NA
sum(df$URBANICITY == "C")
## [1] NA
# impute the NA with the mode
mode_URBANICITY <- names(sort(table(df$URBANICITY), decreasing = TRUE)[1])
df$URBANICITY[is.na(df$URBANICITY)] <- mode_URBANICITY

# verify the change
table(df$URBANICITY, useNA = "always")
## 
##    R    S    T    U <NA> 
## 4005 8967 3944 2456    0
unique(df$URBANICITY)
## [1] "S" "R" "U" "T"
table(is.na(df$URBANICITY))
## 
## FALSE 
## 19372
# impute  the NA entries to numerical values
dummies <- dummyVars(~URBANICITY, data = df)
URBANICITY_encoded <- predict(dummies, df) %>% as.data.frame()
df <- bind_cols(
  df %>% select(-URBANICITY), URBANICITY_encoded
)

# verify for any NA 
sum(is.na(df$URBANICITYR))
## [1] 0
sum(is.na(df$URBANICITYS))
## [1] 0
sum(is.na(df$URBANICITYT))
## [1] 0
sum(is.na(df$URBANICITYU))
## [1] 0

Key findings

There is “?” which could be a data entry error and can be NA

The metadata does not have “C” but the data set has “C”

Combined both “C” and “?” as one column and changed to NA

Imputed the NA values with the mode of URBANICUTY

Used dummy coding for the column URBANICITY for the model.

Checked to verify the changes.

# lets explore the binary columns - DONATED , DONOR_GENDER, HOME_OWNER
table(unique(df$DONOR_GENDER))
## 
## F M U 
## 1 1 1
table(unique(df$HOME_OWNER))
## 
## H N U 
## 1 1 1
# DONOR-GENEDER is nominal categorical 
# use one hot encoding
dummies <- dummyVars(~DONOR_GENDER, data = df)
DONOR_GENDER_encoded <- predict(dummies, df) %>% as.data.frame()
df <- bind_cols(
  df %>% select(-DONOR_GENDER), DONOR_GENDER_encoded
)
# verify the changes
colnames(df)
##  [1] "CONTROL_NUMBER"          "DONATED"                
##  [3] "DONATION_AMT"            "DONOR_AGE"              
##  [5] "HOME_OWNER"              "INCOME_LEVEL"           
##  [7] "SES"                     "MEDIAN_HOME_VALUE"      
##  [9] "MEDIAN_HOUSEHOLD_INCOME" "DONATION_RESPONSE"      
## [11] "MONTHS_SINCE_LAST_GIFT"  "EMAILS_12"              
## [13] "LIFETIME_GIFT_COUNT"     "LIFETIME_EMAILS"        
## [15] "LIFETIME_GIFT_AMOUNT"    "LIFETIME_MAX_GIFT_AMT"  
## [17] "LIFETIME_MIN_GIFT_AMT"   "LIFETIME_AVG_GIFT_AMT"  
## [19] "URBANICITYR"             "URBANICITYS"            
## [21] "URBANICITYT"             "URBANICITYU"            
## [23] "DONOR_GENDERF"           "DONOR_GENDERM"          
## [25] "DONOR_GENDERU"

Key findings

There is “U” - unknown for gender. This could be legitimate data. Probably the gender was not recorded during data entry

It is a nominal categorical- I’ll use one hot encoding for the model and verify the changes.

# HOME_OWNER: whether donor is a home owner or not: H = yes, U = unknown, N = no
# use one-hot-encoding
dummies <- dummyVars(~HOME_OWNER, data = df)
HOME_OWNER_encoded <- predict(dummies, df) %>% as.data.frame()
df <- bind_cols(
  df%>% select(-HOME_OWNER), HOME_OWNER_encoded
)
# verify the changes
colnames(df)
##  [1] "CONTROL_NUMBER"          "DONATED"                
##  [3] "DONATION_AMT"            "DONOR_AGE"              
##  [5] "INCOME_LEVEL"            "SES"                    
##  [7] "MEDIAN_HOME_VALUE"       "MEDIAN_HOUSEHOLD_INCOME"
##  [9] "DONATION_RESPONSE"       "MONTHS_SINCE_LAST_GIFT" 
## [11] "EMAILS_12"               "LIFETIME_GIFT_COUNT"    
## [13] "LIFETIME_EMAILS"         "LIFETIME_GIFT_AMOUNT"   
## [15] "LIFETIME_MAX_GIFT_AMT"   "LIFETIME_MIN_GIFT_AMT"  
## [17] "LIFETIME_AVG_GIFT_AMT"   "URBANICITYR"            
## [19] "URBANICITYS"             "URBANICITYT"            
## [21] "URBANICITYU"             "DONOR_GENDERF"          
## [23] "DONOR_GENDERM"           "DONOR_GENDERU"          
## [25] "HOME_OWNERH"             "HOME_OWNERN"            
## [27] "HOME_OWNERU"

Key findings

HOME_OWNER is a nominal categorical column. I’ll use one hot encoding for the model and verify the changes.

# check the MEDIAN_HOME_VALUE
table(df$MEDIAN_HOME_VALUE[df$MEDIAN_HOME_VALUE == 0])
## 
##   0 
## 218

Key findings

There are 218 count of zero median home value.

Is this possible? turns out -> a median home price of $0 indicates that 50% or more of the analyzed transactions in a particular, small dataset involved a $0 exchange of value.

So this is not a data entry error as i had previously thought in Part A.

confirm all changes

str(df)
## 'data.frame':    19372 obs. of  27 variables:
##  $ CONTROL_NUMBER         : int  5 12 37 38 41 52 53 67 70 71 ...
##  $ DONATED                : logi  FALSE TRUE FALSE FALSE FALSE FALSE ...
##  $ DONATION_AMT           : int  NA 1000 NA NA NA NA NA 500 NA 1600 ...
##  $ DONOR_AGE              : int  87 79 75 60 74 63 71 79 41 63 ...
##  $ INCOME_LEVEL           : num  2 7 5 6 2 3 5 1 4 4 ...
##  $ SES                    : num  830 45 11 4 49 8 50 28 30 43 ...
##  $ MEDIAN_HOME_VALUE      : num  554000 334000 2388000 1688000 514000 ...
##  $ MEDIAN_HOUSEHOLD_INCOME: num  294000 212000 405000 153000 328000 182000 122000 189000 180000 307000 ...
##  $ DONATION_RESPONSE      : int  0 0 0 0 8 0 0 0 0 0 ...
##  $ MONTHS_SINCE_LAST_GIFT : int  26 7 6 6 18 19 21 9 17 9 ...
##  $ EMAILS_12              : int  17 28 43 44 25 29 25 33 36 38 ...
##  $ LIFETIME_GIFT_COUNT    : int  35 25 36 37 12 22 20 20 34 22 ...
##  $ LIFETIME_EMAILS        : int  19 32 44 31 30 22 29 33 36 27 ...
##  $ LIFETIME_GIFT_AMOUNT   : num  29700 36800 60300 43500 10600 12800 22000 10100 17100 15000 ...
##  $ LIFETIME_MAX_GIFT_AMT  : num  2000 2500 2800 1700 2500 1000 2000 700 700 1100 ...
##  $ LIFETIME_MIN_GIFT_AMT  : int  500 500 500 300 500 300 500 300 300 300 ...
##  $ LIFETIME_AVG_GIFT_AMT  : int  849 1472 1675 1176 883 582 1100 505 503 682 ...
##  $ URBANICITYR            : num  0 1 0 0 1 0 1 0 0 1 ...
##  $ URBANICITYS            : num  1 0 1 0 0 0 0 1 1 0 ...
##  $ URBANICITYT            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ URBANICITYU            : num  0 0 0 1 0 1 0 0 0 0 ...
##  $ DONOR_GENDERF          : num  0 0 1 1 1 0 0 1 1 1 ...
##  $ DONOR_GENDERM          : num  1 1 0 0 0 1 1 0 0 0 ...
##  $ DONOR_GENDERU          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOME_OWNERH            : num  1 1 1 1 0 0 1 1 1 1 ...
##  $ HOME_OWNERN            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ HOME_OWNERU            : num  0 0 0 0 1 1 0 0 0 0 ...

Set comparison columns to NULL to make it easier for the model

df$URBANICITYU <- NULL
df$DONOR_GENDERU <- NULL
df$HOME_OWNERU <- NULL
# Remove columns that do not benefit the model
df$CONTROL_NUMBER <- NULL

# Verify the changes
str(df)
## 'data.frame':    19372 obs. of  23 variables:
##  $ DONATED                : logi  FALSE TRUE FALSE FALSE FALSE FALSE ...
##  $ DONATION_AMT           : int  NA 1000 NA NA NA NA NA 500 NA 1600 ...
##  $ DONOR_AGE              : int  87 79 75 60 74 63 71 79 41 63 ...
##  $ INCOME_LEVEL           : num  2 7 5 6 2 3 5 1 4 4 ...
##  $ SES                    : num  830 45 11 4 49 8 50 28 30 43 ...
##  $ MEDIAN_HOME_VALUE      : num  554000 334000 2388000 1688000 514000 ...
##  $ MEDIAN_HOUSEHOLD_INCOME: num  294000 212000 405000 153000 328000 182000 122000 189000 180000 307000 ...
##  $ DONATION_RESPONSE      : int  0 0 0 0 8 0 0 0 0 0 ...
##  $ MONTHS_SINCE_LAST_GIFT : int  26 7 6 6 18 19 21 9 17 9 ...
##  $ EMAILS_12              : int  17 28 43 44 25 29 25 33 36 38 ...
##  $ LIFETIME_GIFT_COUNT    : int  35 25 36 37 12 22 20 20 34 22 ...
##  $ LIFETIME_EMAILS        : int  19 32 44 31 30 22 29 33 36 27 ...
##  $ LIFETIME_GIFT_AMOUNT   : num  29700 36800 60300 43500 10600 12800 22000 10100 17100 15000 ...
##  $ LIFETIME_MAX_GIFT_AMT  : num  2000 2500 2800 1700 2500 1000 2000 700 700 1100 ...
##  $ LIFETIME_MIN_GIFT_AMT  : int  500 500 500 300 500 300 500 300 300 300 ...
##  $ LIFETIME_AVG_GIFT_AMT  : int  849 1472 1675 1176 883 582 1100 505 503 682 ...
##  $ URBANICITYR            : num  0 1 0 0 1 0 1 0 0 1 ...
##  $ URBANICITYS            : num  1 0 1 0 0 0 0 1 1 0 ...
##  $ URBANICITYT            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ DONOR_GENDERF          : num  0 0 1 1 1 0 0 1 1 1 ...
##  $ DONOR_GENDERM          : num  1 1 0 0 0 1 1 0 0 0 ...
##  $ HOME_OWNERH            : num  1 1 1 1 0 0 1 1 1 1 ...
##  $ HOME_OWNERN            : num  0 0 0 0 0 0 0 0 0 0 ...

Key findings

I notice that there are NA values in DONATION_AMT

This is the target variable so let me explore that before visualization

table(df$DONATION_AMT)
## 
##   100   200   250   300   400   450   500   525   600   700   750   800   900 
##     7    21     1    86    76     1   503     1   124   126     1   114    71 
##  1000  1070  1100  1200  1250  1300  1392  1400  1500  1600  1687  1700  1750 
##   941     1   118   161    11    64     1    81   591    85     1    63     1 
##  1800  1825  1900  2000  2100  2200  2300  2400  2500  2600  2700  2800  2900 
##    37     1    27   577    92    23    40    13   392    26     9     6     2 
##  3000  3100  3200  3300  3400  3500  3600  3700  3800  4000  4100  4200  4300 
##    99     3    12     3     2    39     9     6     8    27     2     2     2 
##  4400  4421  4500  4600  4700  4800  5000  5100  5300  5500  6000  7500  9500 
##     2     1     8     2     5     1    68     3     2     1     3     7     1 
## 10000 10100 10200 15000 20000 
##    22     1     1     1     4
table(df$DONATED)
## 
## FALSE  TRUE 
## 14529  4843

Key findings

The target variable has NA values.

The DONATED col has 14,529 people who didnt donate and 4,843 of people who donated.

Hence the NA values in the DONATION_AMT column are the 14,529 people who didnt donate.

The goal of this model is to predict the target feature “DONATION_AMOUNT” for those who donate.

ACTION: Filter the data to only those who donated inorder to make predictions of the amount.

# set DONATED to null
df$DONATED <- NULL
# Filter data to donors only
donors_only <- df %>% filter(DONATION_AMT >0)
# verify the changes
table(donors_only$DONATION_AMT)
## 
##   100   200   250   300   400   450   500   525   600   700   750   800   900 
##     7    21     1    86    76     1   503     1   124   126     1   114    71 
##  1000  1070  1100  1200  1250  1300  1392  1400  1500  1600  1687  1700  1750 
##   941     1   118   161    11    64     1    81   591    85     1    63     1 
##  1800  1825  1900  2000  2100  2200  2300  2400  2500  2600  2700  2800  2900 
##    37     1    27   577    92    23    40    13   392    26     9     6     2 
##  3000  3100  3200  3300  3400  3500  3600  3700  3800  4000  4100  4200  4300 
##    99     3    12     3     2    39     9     6     8    27     2     2     2 
##  4400  4421  4500  4600  4700  4800  5000  5100  5300  5500  6000  7500  9500 
##     2     1     8     2     5     1    68     3     2     1     3     7     1 
## 10000 10100 10200 15000 20000 
##    22     1     1     1     4
str(donors_only)
## 'data.frame':    4843 obs. of  22 variables:
##  $ DONATION_AMT           : int  1000 500 1600 300 1200 800 1500 1000 3300 2000 ...
##  $ DONOR_AGE              : int  79 79 63 60 81 45 87 77 37 30 ...
##  $ INCOME_LEVEL           : num  7 1 4 4 4 2 1 4 5 6 ...
##  $ SES                    : num  45 28 43 43 45 30 20 34 23 35 ...
##  $ MEDIAN_HOME_VALUE      : num  334000 1004000 399000 475000 530000 ...
##  $ MEDIAN_HOUSEHOLD_INCOME: num  212000 189000 307000 227000 236000 387000 222000 331000 327000 544000 ...
##  $ DONATION_RESPONSE      : int  0 0 0 0 24 4 6 0 7 11 ...
##  $ MONTHS_SINCE_LAST_GIFT : int  7 9 9 22 6 5 17 5 8 20 ...
##  $ EMAILS_12              : int  28 33 38 31 40 34 34 40 38 19 ...
##  $ LIFETIME_GIFT_COUNT    : int  25 20 22 31 29 22 24 16 21 9 ...
##  $ LIFETIME_EMAILS        : int  32 33 27 44 33 29 39 33 40 13 ...
##  $ LIFETIME_GIFT_AMOUNT   : num  36800 10100 15000 18600 27000 12200 26100 18600 47000 7600 ...
##  $ LIFETIME_MAX_GIFT_AMT  : num  2500 700 1100 1100 1500 800 1500 1700 3500 2000 ...
##  $ LIFETIME_MIN_GIFT_AMT  : int  500 300 300 200 500 300 300 500 1000 500 ...
##  $ LIFETIME_AVG_GIFT_AMT  : int  1472 505 682 600 931 555 1088 1163 2238 844 ...
##  $ URBANICITYR            : num  1 0 1 1 1 0 0 0 0 0 ...
##  $ URBANICITYS            : num  0 1 0 0 0 1 1 0 1 0 ...
##  $ URBANICITYT            : num  0 0 0 0 0 0 0 1 0 1 ...
##  $ DONOR_GENDERF          : num  0 1 1 1 0 0 1 0 1 0 ...
##  $ DONOR_GENDERM          : num  1 0 0 0 1 1 0 0 0 1 ...
##  $ HOME_OWNERH            : num  1 1 1 0 0 0 0 0 0 1 ...
##  $ HOME_OWNERN            : num  0 0 0 0 0 1 0 0 0 0 ...

Key findings

working with 4843 observations(people who donated only) and 22 variables

VISUALIZATION

# distribution of target variable
ggplot(donors_only, aes(x = DONATION_AMT)) +
  geom_bar(fill = "steelblue") +
  labs(title="Distribution of donation amounts") +
  theme(axis.text = element_text(angle = 45, vjust = 0.5, hjust = 1))

#### Key findings

The data is right skewed which in reality it is expected as in donation campaigns, there are more small contributions with very few large or high contributions.

The bar plot shows most donors giving between 1 and 5000 amounts with very few donors giving between 5001 and 10,000.

To create the model i will use log transformation to compress the scale.

First I’ll create a correlation plot of numerical variables.

# make the plot larger so the numbers are readable
options(repr.plot.width = 15, repr.plot.height = 15)
numeric_cols <- donors_only %>%
  select(where(is.numeric))
# correlation plot
par(mar = c(2,2,2,2)) # expand margins because the plot is smalL
# visualize the corr plot
corrplot(cor(numeric_cols),
         method = 'color',
         type = 'upper',
         addCoef.col = 'black',
         tl.srt = 45,
         number.cex = 0.7,
         tl.cex = 0.6
         )

#### Key findings

STRONG PREDICTORS

LIFETIME_AVG_GIFT_AMT, LIFETIME_MAX_GIFT_AMT -> Individuals who have a history of giving are likely to give in the future.

LIFETIME_EMAILS - a person is likely to give when they receive an email about the campaign

LIFETIME_GIFT_COUNT -> The more a person has given in the past, the more they are likely to give in the future.

There is multicollinearity between:

LIFETIME_GIFT_COUNT and LIFETIME_EMAILS

DONOR_GENDERF and DONOR_GENDERM

MEDIAN_HOME_VALUE and MEDIAN_HOUSEHOLD_INCOME
cor_matrix <- cor(numeric_cols, use = "complete.obs")
donation_cors <- sort(cor_matrix["DONATION_AMT",], decreasing = TRUE)
print(round(donation_cors, 3))
##            DONATION_AMT   LIFETIME_AVG_GIFT_AMT   LIFETIME_MAX_GIFT_AMT 
##                   1.000                   0.517                   0.443 
##   LIFETIME_MIN_GIFT_AMT    LIFETIME_GIFT_AMOUNT       MEDIAN_HOME_VALUE 
##                   0.296                   0.248                   0.126 
## MEDIAN_HOUSEHOLD_INCOME            INCOME_LEVEL  MONTHS_SINCE_LAST_GIFT 
##                   0.118                   0.105                   0.090 
##                     SES               EMAILS_12             URBANICITYS 
##                   0.058                   0.055                   0.042 
##           DONOR_GENDERM             HOME_OWNERH       DONATION_RESPONSE 
##                   0.038                   0.013                   0.013 
##             URBANICITYT           DONOR_GENDERF             HOME_OWNERN 
##                  -0.022                  -0.040                  -0.044 
##               DONOR_AGE             URBANICITYR         LIFETIME_EMAILS 
##                  -0.045                  -0.056                  -0.098 
##     LIFETIME_GIFT_COUNT 
##                  -0.220

Key findings

STRONG PREDICTORS:

LIFETIME_AVG_GIFT_AMT - 0.517
LIFETIME_MAX_GIFT_AMT - 0.443
LIFETIME_MIN_GIFT_AMT - 0.296
LIFETIME_GIFT_AMOUNT - 0.248

WEAK PREDICTORS:

MEDIAN_HOME_VALUE
MEDIAN_HOUSEHOLD_INCOME
INCOME_LEVEL
MONTHS_SINCE_LAST_GIFT  
SES
EMAILS_12   

NEGATIVE PREDICTORS:

URBANICITY
GENDER
HOME OWNER
LIFETIME_GIFT_EMAILS
LIFETIME_GIFT_COUNT

LIFETIME_AVG_GIFT_AMT,LIFETIME_MAX_GIFT_AMT, LIFETIME_MIN_GIFT_AMT, LIFETIME_GIFT_AMOUNT all seem to predict the donation amount so i will choose LIFETIME_AVG_GIFT_AMT instead of having all four.

For the weak predictors i will keep:

MEDIAN_HOME_VALUE - beacsue a person who is able to afford a good home is likley able to donate. Hifgher socio-economic status individuals are likely to make a higher donation     compared to lower socio-economic class individuals

INCOME_LEVEL - higher income level equals to higher SES which equals to higher MEDIAN_HOME_VALUE

NB: MEDIAN_HOME_VALUE, INCOME_LEVEL, SES, & MEDIAN_HOUSEHOLD_INCOME all seem to form a chain where one affects the other other so ill choose one instead of having the four. 


MONTHS_SINCE_LAST_GIFT - if last donation was more current, the individuals are liklly to donate but ammount can be varying.

ACTION:

I'll drop MEDIAN_HOME_VALUE, MEDIAN_HOUSEHOLD_INCOME, SES, LIFETIME_MAX_GIFT_AMT, LIFETIME_MIN_GIFT_AMT,  LIFETIME_GIFT_AMOUNT for the model. 

For URBANICITY DUMMIES, HOMEOWNER and DONOR_GENDER, I'll use one as a reference foe the others not listed. For example DONOR_GENDERM is listed and any unidivual who us not male     is either female or gender not listed(UNKNOWN)
donors_only_model <- donors_only %>%
  select("DONATION_AMT",
         "DONOR_AGE",
         "INCOME_LEVEL",
         "DONATION_RESPONSE",
         "MONTHS_SINCE_LAST_GIFT",
         "EMAILS_12",
         "LIFETIME_AVG_GIFT_AMT",
         "URBANICITYR",
         "DONOR_GENDERM",
         "HOME_OWNERH",
         "LIFETIME_EMAILS"
         )
str(donors_only_model)
## 'data.frame':    4843 obs. of  11 variables:
##  $ DONATION_AMT          : int  1000 500 1600 300 1200 800 1500 1000 3300 2000 ...
##  $ DONOR_AGE             : int  79 79 63 60 81 45 87 77 37 30 ...
##  $ INCOME_LEVEL          : num  7 1 4 4 4 2 1 4 5 6 ...
##  $ DONATION_RESPONSE     : int  0 0 0 0 24 4 6 0 7 11 ...
##  $ MONTHS_SINCE_LAST_GIFT: int  7 9 9 22 6 5 17 5 8 20 ...
##  $ EMAILS_12             : int  28 33 38 31 40 34 34 40 38 19 ...
##  $ LIFETIME_AVG_GIFT_AMT : int  1472 505 682 600 931 555 1088 1163 2238 844 ...
##  $ URBANICITYR           : num  1 0 1 1 1 0 0 0 0 0 ...
##  $ DONOR_GENDERM         : num  1 0 0 0 1 1 0 0 0 1 ...
##  $ HOME_OWNERH           : num  1 1 1 0 0 0 0 0 0 1 ...
##  $ LIFETIME_EMAILS       : int  32 33 27 44 33 29 39 33 40 13 ...

log transformation

The data is right skewed, the best solution would be log transformation

Anthropic. (2025). Claude. Claude.ai. https://claude.ai/ I used AI to better understand how to interepret my findings before and after log transformation

# perform log transformation
donors_only_model$LOG_DONATION_AMT <- log(donors_only_model$DONATION_AMT)
# plot a histogram after log transformation
hist(donors_only_model$LOG_DONATION_AMT,
     main = "DONATION AMOUNT - AFTER LOG TRANSFORMATION",
     xlab = "Log(Donation Amount)",
     col = "darkorange")

# remove "DONATION_AMT"
donors_only_model$DONATION_AMT <- NULL

Split the data

set.seed(123)

# create a train index 
train_index <- createDataPartition(donors_only_model$LOG_DONATION_AMT,p = 0.8,list = FALSE)
# split into training and testing data
train_data <- donors_only_model[train_index, ]
test_data <- donors_only_model[-train_index, ]
# check the total observations
total_observations <- sum(nrow(train_data) + nrow(test_data))

# verify
nrow(train_data)
## [1] 3875
nrow(test_data)
## [1] 968
total_observations
## [1] 4843
prop.table(table(train_data$LOG_DONATION_AMT))
## 
## 4.60517018598809 5.29831736654804 5.52146091786225  5.7037824746562 
##     0.0018064516     0.0038709677     0.0002580645     0.0185806452 
## 5.99146454710798 6.21460809842219 6.26339826259162 6.39692965521615 
##     0.0157419355     0.1016774194     0.0002580645     0.0252903226 
##  6.5510803350434 6.62007320653036 6.68461172766793 6.80239476332431 
##     0.0260645161     0.0002580645     0.0224516129     0.0144516129 
## 6.90775527898214 6.97541392745595 7.00306545878646 7.09007683577609 
##     0.1974193548     0.0002580645     0.0255483871     0.0330322581 
## 7.13089883029635 7.17011954344963 7.23849684089437 7.24422751560335 
##     0.0020645161     0.0123870968     0.0002580645     0.0167741935 
##  7.3132203870903 7.37775890822787 7.43070708254597 7.43838353004431 
##     0.1223225806     0.0170322581     0.0002580645     0.0139354839 
## 7.49554194388426 7.50933526601659 7.54960916515453 7.60090245954208 
##     0.0072258065     0.0002580645     0.0061935484     0.1181935484 
## 7.64969262371151 7.69621263934641 7.74066440191724 7.78322401633604 
##     0.0203870968     0.0054193548     0.0082580645     0.0025806452 
## 7.82404601085629 7.86326672400957 7.90100705199242  7.9373746961633 
##     0.0792258065     0.0049032258     0.0023225806     0.0010322581 
## 7.97246601597457 8.00636756765025 8.03915739047324 8.07090608878782 
##     0.0005161290     0.0198709677     0.0005161290     0.0023225806 
## 8.10167774745457 8.13153071060425  8.1605182474775  8.1886891244442 
##     0.0005161290     0.0002580645     0.0080000000     0.0015483871 
## 8.21608809863232 8.24275634571448 8.29404964010203  8.3187422526924 
##     0.0015483871     0.0012903226     0.0064516129     0.0005161290 
## 8.34283980427146 8.36637030168165 8.38935981990635 8.41183267575841 
##     0.0002580645     0.0005161290     0.0002580645     0.0012903226 
## 8.43381158247719 8.45531778769815 8.47637119689598 8.51719319141624 
##     0.0005161290     0.0012903226     0.0002580645     0.0144516129 
## 8.53699581871242 8.57546209954021 8.61250337122056 8.69951474821019 
##     0.0005161290     0.0002580645     0.0002580645     0.0007741935 
##  8.9226582995244 9.15904707758863 9.21034037197618 9.22029070282935 
##     0.0015483871     0.0002580645     0.0049032258     0.0002580645 
## 9.23014299927236 9.61580548008435 9.90348755253613 
##     0.0002580645     0.0002580645     0.0005161290
prop.table(table(test_data$LOG_DONATION_AMT))
## 
## 5.29831736654804  5.7037824746562 5.99146454710798 6.10924758276437 
##      0.006198347      0.014462810      0.015495868      0.001033058 
## 6.21460809842219 6.39692965521615  6.5510803350434 6.68461172766793 
##      0.112603306      0.026859504      0.025826446      0.027892562 
## 6.80239476332431 6.90775527898214 7.00306545878646 7.09007683577609 
##      0.015495868      0.181818182      0.019628099      0.034090909 
## 7.13089883029635 7.17011954344963 7.24422751560335  7.3132203870903 
##      0.003099174      0.016528926      0.016528926      0.120867769 
## 7.37775890822787 7.43838353004431 7.46737106691756 7.49554194388426 
##      0.019628099      0.009297521      0.001033058      0.009297521 
## 7.54960916515453 7.60090245954208 7.64969262371151 7.69621263934641 
##      0.003099174      0.122933884      0.013429752      0.002066116 
## 7.74066440191724 7.78322401633604 7.82404601085629 7.86326672400957 
##      0.008264463      0.003099174      0.087809917      0.007231405 
##  7.9373746961633 8.00636756765025 8.03915739047324 8.07090608878782 
##      0.002066116      0.022727273      0.001033058      0.003099174 
## 8.10167774745457 8.13153071060425  8.1605182474775  8.1886891244442 
##      0.001033058      0.001033058      0.008264463      0.003099174 
## 8.24275634571448 8.29404964010203 8.34283980427146 8.38935981990635 
##      0.003099174      0.002066116      0.001033058      0.001033058 
## 8.39412119382624 8.41183267575841 8.51719319141624 8.53699581871242 
##      0.001033058      0.003099174      0.012396694      0.001033058 
## 8.57546209954021  8.9226582995244 9.21034037197618 9.90348755253613 
##      0.001033058      0.001033058      0.003099174      0.002066116

Build a multiple regression model

model <- lm(LOG_DONATION_AMT~., data = train_data)
summary(model)
## 
## Call:
## lm(formula = LOG_DONATION_AMT ~ ., data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.9721  -0.2927   0.0525   0.3579   2.8500 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             6.178e+00  8.702e-02  70.995  < 2e-16 ***
## DONOR_AGE              -6.872e-04  6.990e-04  -0.983   0.3256    
## INCOME_LEVEL            3.010e-02  6.175e-03   4.875 1.13e-06 ***
## DONATION_RESPONSE      -8.224e-04  9.643e-04  -0.853   0.3938    
## MONTHS_SINCE_LAST_GIFT  2.715e-02  2.621e-03  10.360  < 2e-16 ***
## EMAILS_12               1.275e-02  1.888e-03   6.750 1.70e-11 ***
## LIFETIME_AVG_GIFT_AMT   2.504e-04  8.731e-06  28.674  < 2e-16 ***
## URBANICITYR            -5.910e-02  2.303e-02  -2.566   0.0103 *  
## DONOR_GENDERM           4.307e-02  1.849e-02   2.329   0.0199 *  
## HOME_OWNERH            -1.216e-02  1.985e-02  -0.612   0.5403    
## LIFETIME_EMAILS        -6.027e-03  1.184e-03  -5.088 3.78e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5562 on 3864 degrees of freedom
## Multiple R-squared:  0.2525, Adjusted R-squared:  0.2505 
## F-statistic: 130.5 on 10 and 3864 DF,  p-value: < 2.2e-16

Key findings

Based on the model;

DONOR_AGE, DONATION_RESPONSE are not significant. 

MONTHS_SINCE_LAST_GIFT is highly significant. May be human psychological behavior of guilt for not having donated for a while? Also, donors who give large donations are likely
to have time gaps between donations while donors who give small donations are likely to give frequent donations. 

INCOME_LEVEL  is highly significant which was expected. A person who does not have to worry about their finances is higly likly to give.

URBANICITYR is significant but not the most significant. It seems that donations are not always based on place of living.

LIFETIME_EMAILS is negatively highly significant. Constant communication is likly be monotonous for an individual while  EMAILS_12 is positively highly sognificant which could
mean that communication via email of only the campaign is likley to bring in new donors and encourage previous donors to donate. Constant emails causes "donor fatigue" but
targeted communications increases donations.

ACTION: build a model 2 with the most significant variables.

model2 <- lm(LOG_DONATION_AMT~ INCOME_LEVEL + MONTHS_SINCE_LAST_GIFT + EMAILS_12 + LIFETIME_AVG_GIFT_AMT + URBANICITYR + DONOR_GENDERM + LIFETIME_EMAILS,
             data = train_data)
summary(model2)
## 
## Call:
## lm(formula = LOG_DONATION_AMT ~ INCOME_LEVEL + MONTHS_SINCE_LAST_GIFT + 
##     EMAILS_12 + LIFETIME_AVG_GIFT_AMT + URBANICITYR + DONOR_GENDERM + 
##     LIFETIME_EMAILS, data = train_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.9787  -0.2919   0.0491   0.3573   2.8143 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             6.131e+00  7.293e-02  84.065  < 2e-16 ***
## INCOME_LEVEL            2.978e-02  5.640e-03   5.280 1.36e-07 ***
## MONTHS_SINCE_LAST_GIFT  2.734e-02  2.611e-03  10.468  < 2e-16 ***
## EMAILS_12               1.268e-02  1.883e-03   6.736 1.87e-11 ***
## LIFETIME_AVG_GIFT_AMT   2.504e-04  8.730e-06  28.683  < 2e-16 ***
## URBANICITYR            -5.598e-02  2.288e-02  -2.446   0.0145 *  
## DONOR_GENDERM           4.109e-02  1.843e-02   2.230   0.0258 *  
## LIFETIME_EMAILS        -6.244e-03  1.170e-03  -5.338 9.93e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5562 on 3867 degrees of freedom
## Multiple R-squared:  0.252,  Adjusted R-squared:  0.2506 
## F-statistic: 186.1 on 7 and 3867 DF,  p-value: < 2.2e-16

Key findings

All variables are significant in model2.

Make predictions with the test data

predictions <- predict(model2, newdata = test_data)

# calculate the RMSE & MAE
residual <- test_data$LOG_DONATION_AMT - predictions
rmse <- sqrt(mean(residual^2))
mae <- mean(abs(residual))

# back transform for interpretation
predicted_funds <- exp(predictions)
actual_funds <- exp(test_data$LOG_DONATION_AMT)

# calculate the RMSE & MAE
rmse_funds <- sqrt(mean((actual_funds - predicted_funds)^2))
mae_funds <- mean(abs(actual_funds - predicted_funds))
rmse_funds
## [1] 1119.25
mae_funds
## [1] 594.4031

Key findings

On a log scale the model2 predicts RMSE: 0.52 and MAE: 0.4. When transformed back to dollars the model’s predictions are off by $594.4

but if it makes bigger mistakes, its off by $1119.25 which is almost twice as much. This means that the model makes larger errors for large donations which is

likely due to the unpredictability nature of large donations.

par(mfrow = c(2,2))
plot(model2)

Key findings

There seems to be a huge gap between donor 1286 and other donors.

Remove donor 1286 and create model3

new_train_data<-  train_data %>%
  filter(LIFETIME_AVG_GIFT_AMT < 45000)

model3 <- lm(LOG_DONATION_AMT~ INCOME_LEVEL + MONTHS_SINCE_LAST_GIFT + EMAILS_12 + LIFETIME_AVG_GIFT_AMT + URBANICITYR + DONOR_GENDERM + LIFETIME_EMAILS,
             data = new_train_data)
summary(model3)
## 
## Call:
## lm(formula = LOG_DONATION_AMT ~ INCOME_LEVEL + MONTHS_SINCE_LAST_GIFT + 
##     EMAILS_12 + LIFETIME_AVG_GIFT_AMT + URBANICITYR + DONOR_GENDERM + 
##     LIFETIME_EMAILS, data = new_train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8184 -0.2617  0.0464  0.3129  2.7266 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             6.0269641  0.0660255  91.282  < 2e-16 ***
## INCOME_LEVEL            0.0221973  0.0051046   4.348 1.41e-05 ***
## MONTHS_SINCE_LAST_GIFT  0.0220999  0.0023672   9.336  < 2e-16 ***
## EMAILS_12               0.0069177  0.0017136   4.037 5.52e-05 ***
## LIFETIME_AVG_GIFT_AMT   0.0004589  0.0000106  43.271  < 2e-16 ***
## URBANICITYR            -0.0447049  0.0206907  -2.161   0.0308 *  
## DONOR_GENDERM           0.0298582  0.0166641   1.792   0.0732 .  
## LIFETIME_EMAILS        -0.0014502  0.0010699  -1.355   0.1754    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5028 on 3866 degrees of freedom
## Multiple R-squared:  0.3888, Adjusted R-squared:  0.3877 
## F-statistic: 351.3 on 7 and 3866 DF,  p-value: < 2.2e-16

Key findings

Interesting how one donor has led to an increase in R squared and adjusted r squared.

Also, that one donor made DONOR_GENDERM and LIFETIME_EMAILS significant and now they are not.

I will remove DONOR_GENDERM and LIFETIME_EMAILS

Drop LIFETIME_EMAILS variable

model4 <- lm(LOG_DONATION_AMT~ INCOME_LEVEL + MONTHS_SINCE_LAST_GIFT + EMAILS_12 + LIFETIME_AVG_GIFT_AMT + URBANICITYR + DONOR_GENDERM,
             data = new_train_data)
summary(model4)
## 
## Call:
## lm(formula = LOG_DONATION_AMT ~ INCOME_LEVEL + MONTHS_SINCE_LAST_GIFT + 
##     EMAILS_12 + LIFETIME_AVG_GIFT_AMT + URBANICITYR + DONOR_GENDERM, 
##     data = new_train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.8579 -0.2612  0.0484  0.3170  2.7066 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             6.017e+00  6.566e-02  91.645  < 2e-16 ***
## INCOME_LEVEL            2.248e-02  5.101e-03   4.407 1.08e-05 ***
## MONTHS_SINCE_LAST_GIFT  2.169e-02  2.348e-03   9.237  < 2e-16 ***
## EMAILS_12               5.991e-03  1.572e-03   3.812  0.00014 ***
## LIFETIME_AVG_GIFT_AMT   4.627e-04  1.022e-05  45.298  < 2e-16 ***
## URBANICITYR            -4.637e-02  2.066e-02  -2.245  0.02483 *  
## DONOR_GENDERM           2.991e-02  1.667e-02   1.795  0.07277 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5028 on 3867 degrees of freedom
## Multiple R-squared:  0.3885, Adjusted R-squared:  0.3876 
## F-statistic: 409.5 on 6 and 3867 DF,  p-value: < 2.2e-16

Key findings

In model4, DONOR_GENDERM is not statistically significant because it has not met p <0.05. However, gennder differences are always stated in fundraising.

par(mfrow = c(2,2))
plot(model4)

#### Key findings

There seems to be another outlier donor 1457 just like donor 1286 who had a lifetime donation amount of $45000 which is significantly outside the donation range.

However, the model has improved following removal of donor 1286.

I could also remove donor 1457 but at the same time, i would reduce the predictive validity of this model and limit it to small donors which is not the reality where high value donors will always be present.

Large donors are influenced by factors such as wealth management strategies and possibility of a personal relationship with the said organization or specific cause and other factors that were not present in this data set.

Rather than removing donor 1457, i suggest a separate model for major donors.

Evaluate model4

predictions4 <- predict(model4, newdata = test_data)

# calculate the RMSE & MAE
residual4 <- test_data$LOG_DONATION_AMT - predictions4
rmse4 <- sqrt(mean(residual4^2))
mae4 <- mean(abs(residual4))
rmse4
## [1] 0.483537
mae4
## [1] 0.3680748
# back transform for interpretation
predicted_funds4 <- exp(predictions4)
actual_funds4 <- exp(test_data$LOG_DONATION_AMT)

# calculate the RMSE & MAE
rmse_funds4 <- sqrt(mean((actual_funds4 - predicted_funds4)^2))
mae_funds4 <- mean(abs(actual_funds4 - predicted_funds4))
rmse_funds4
## [1] 2028.789
mae_funds4
## [1] 623.351

Key findings

On a log scale the model4 predicts RMSE: 0.48 and MAE: 0.37.

When transformed back to dollars the model’s predictions are off by $623.35 but if it makes bigger mistakes, its off by $ 2028.79

Model4 outperforms model 2 indicating improvement however when back transformed to dollar values, RMSE incrases to $623.35 which is suggests that the model struggles with high donor predictions.

I would suggest a separate model for high value donors.

Summary of the models

Model - all variables

Model2 - only significant variables included

Model3 - outlier removed (donor 1286)

Model4 - backward elimination - final model

# Train the model on three equal datasets
num_models <- 3
subset_size <- nrow(new_train_data) # multiple (different but same size) random subsets of the data
reg_models<- list()

for (i in 1:num_models) {
  sample_indices <- sample(1:nrow(new_train_data),
                           size = subset_size,
                           replace = TRUE)
  subset <- new_train_data[sample_indices, ]

  # train logistic regression 
  reg_models[[i]] <- lm(LOG_DONATION_AMT~.,data = subset)
}
# get predictions from each model
predictions_1 <- predict(reg_models[[1]], newdata = test_data)
predictions_2 <- predict(reg_models[[2]], newdata = test_data)
predictions_3 <- predict(reg_models[[3]], newdata = test_data)

# get the average of the three predictions
ensemble_prediction <- (predictions_3 + predictions_2 + predictions_1) / 3

# calculate the RMSE & MAE
residual_ensemble <- test_data$LOG_DONATION_AMT - ensemble_prediction
rmse_ensemble <- sqrt(mean(residual_ensemble^2))
mae_ensemble <- mean(abs(residual_ensemble))
rmse_ensemble
## [1] 0.4810096
mae_ensemble
## [1] 0.36548
# back transform to real funds for interpretation
predicted_funds_ensemble <- exp(ensemble_prediction)
actual_funds_ensemble <- exp(test_data$LOG_DONATION_AMT)

rmse_funds_ensemble <- sqrt(mean((actual_funds_ensemble - predicted_funds_ensemble)^2))
mae_funds_ensemble <- mean(abs(actual_funds_ensemble - predicted_funds_ensemble))
rmse_funds_ensemble
## [1] 2429.004
mae_funds_ensemble
## [1] 642.7275

Key findings

On an ensemble model, the predictions are RMSE: 0.48 and MAE: 0.37 before log transformation and RMSE: 2429

and MAE: 642.73 after log transformation compared to to a single model (model4) with the predictions RMSE: 0.48 and MAE: 0.37 before log

transformation and RMSE: 2028.79 and MAE: 623.35 after log transformation.

There is slight but unsignificant change between the single model and the ensemble model. For linear regression algorithm, bagging has low effect due to low variance.

This model predicts typical donor behavior but is limited towards large donors.

save my work

save.image(file = "MuchukiM.DA5030.Practicum2-PartB.RData")