library('caret')
## Loading required package: ggplot2
## Loading required package: lattice
library('dplyr')
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library('ggplot2')
library('corrplot')
## corrplot 0.95 loaded
library('repr')
zip_url <- "https://s3.us-east-2.amazonaws.com/artificium.us/datasets/donations-salted.csv.zip"
zip_file <- tempfile(fileext = ".zip")
download.file(zip_url, zip_file, mode = 'wb')
df <- read.csv(unz(zip_file, "donations-salted.csv"), header = TRUE, sep = ",")
head(df)
str(df)
## 'data.frame': 19372 obs. of 20 variables:
## $ CONTROL_NUMBER : int 5 12 37 38 41 52 53 67 70 71 ...
## $ DONATED : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
## $ DONATION_AMT : int NA 1000 NA NA NA NA NA 500 NA 1600 ...
## $ DONOR_AGE : int 87 79 75 NA 74 63 71 79 41 63 ...
## $ URBANICITY : chr "?" "R" "S" "U" ...
## $ DONOR_GENDER : chr "M" "M" "F" "F" ...
## $ HOME_OWNER : chr "H" "H" "H" "H" ...
## $ INCOME_LEVEL : int 2 7 5 6 2 3 5 1 4 4 ...
## $ SES : chr "." "45" "11" "4" ...
## $ MEDIAN_HOME_VALUE : num 554000 334000 2388000 1688000 514000 ...
## $ MEDIAN_HOUSEHOLD_INCOME: num 294000 212000 405000 153000 328000 182000 122000 189000 180000 307000 ...
## $ DONATION_RESPONSE : int 0 0 0 0 8 0 0 0 0 0 ...
## $ MONTHS_SINCE_LAST_GIFT : int 26 7 6 6 18 19 21 9 17 9 ...
## $ EMAILS_12 : int 17 28 43 44 25 29 25 33 36 38 ...
## $ LIFETIME_GIFT_COUNT : int 35 25 36 37 12 22 20 20 34 22 ...
## $ LIFETIME_EMAILS : int 19 32 44 31 30 22 29 33 36 27 ...
## $ LIFETIME_GIFT_AMOUNT : num 29700 36800 60300 43500 10600 12800 22000 10100 17100 15000 ...
## $ LIFETIME_MAX_GIFT_AMT : num 2000 2500 2800 1700 2500 1000 2000 700 700 1100 ...
## $ LIFETIME_MIN_GIFT_AMT : int 500 500 500 300 500 300 500 300 300 300 ...
## $ LIFETIME_AVG_GIFT_AMT : int 849 1472 1675 1176 883 582 1100 505 503 682 ...
summary(df)
## CONTROL_NUMBER DONATED DONATION_AMT DONOR_AGE
## Min. : 5 Mode :logical Min. : 100 Min. : 0.00
## 1st Qu.: 48289 FALSE:14529 1st Qu.: 1000 1st Qu.:47.00
## Median : 96937 TRUE :4843 Median : 1300 Median :60.00
## Mean : 96546 Mean : 1562 Mean :58.92
## 3rd Qu.:145430 3rd Qu.: 2000 3rd Qu.:73.00
## Max. :191779 Max. :20000 Max. :87.00
## NA's :14529 NA's :4795
## URBANICITY DONOR_GENDER HOME_OWNER INCOME_LEVEL
## Length:19372 Length:19372 Length:19372 Min. :1.000
## Class :character Class :character Class :character 1st Qu.:2.000
## Mode :character Mode :character Mode :character Median :4.000
## Mean :3.908
## 3rd Qu.:5.000
## Max. :7.000
## NA's :4392
## SES MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME DONATION_RESPONSE
## Length:19372 Min. : 0 Min. : 0 Min. : 0.000
## Class :character 1st Qu.: 518000 1st Qu.: 232000 1st Qu.: 0.000
## Mode :character Median : 747000 Median : 311000 Median : 0.000
## Mean :1079872 Mean : 341970 Mean : 3.362
## 3rd Qu.:1227000 3rd Qu.: 417000 3rd Qu.: 3.000
## Max. :6000000 Max. :1500000 Max. :241.000
##
## MONTHS_SINCE_LAST_GIFT EMAILS_12 LIFETIME_GIFT_COUNT LIFETIME_EMAILS
## Min. : 4.00 Min. : 2.00 Min. : 1.00 Min. : 2.00
## 1st Qu.:16.00 1st Qu.:16.00 1st Qu.: 4.00 1st Qu.:11.00
## Median :18.00 Median :18.00 Median : 8.00 Median :18.00
## Mean :18.19 Mean :18.27 Mean : 9.98 Mean :18.67
## 3rd Qu.:20.00 3rd Qu.:19.00 3rd Qu.:14.00 3rd Qu.:26.00
## Max. :27.00 Max. :77.00 Max. :95.00 Max. :56.00
##
## LIFETIME_GIFT_AMOUNT LIFETIME_MAX_GIFT_AMT LIFETIME_MIN_GIFT_AMT
## Min. : 1500 Min. : 500 Min. : 0.0
## 1st Qu.: 4200 1st Qu.: 1200 1st Qu.: 300.0
## Median : 7900 Median : 1600 Median : 500.0
## Mean : 10443 Mean : 1921 Mean : 762.1
## 3rd Qu.: 13200 3rd Qu.: 2100 3rd Qu.: 1000.0
## Max. :377500 Max. :100000 Max. :45000.0
##
## LIFETIME_AVG_GIFT_AMT
## Min. : 136
## 1st Qu.: 800
## Median : 1120
## Mean : 1286
## 3rd Qu.: 1500
## Max. :45000
##
# check for NA values
colSums(is.na(df))
## CONTROL_NUMBER DONATED DONATION_AMT
## 0 0 14529
## DONOR_AGE URBANICITY DONOR_GENDER
## 4795 0 0
## HOME_OWNER INCOME_LEVEL SES
## 0 4392 0
## MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME DONATION_RESPONSE
## 0 0 0
## MONTHS_SINCE_LAST_GIFT EMAILS_12 LIFETIME_GIFT_COUNT
## 0 0 0
## LIFETIME_EMAILS LIFETIME_GIFT_AMOUNT LIFETIME_MAX_GIFT_AMT
## 0 0 0
## LIFETIME_MIN_GIFT_AMT LIFETIME_AVG_GIFT_AMT
## 0 0
sum(is.na(df))
## [1] 23716
# check for empty strings
any(apply(df, 2, function(x) any(x == ""| x == " ")))
## [1] NA
# check for unknown entries in the data frame
colSums(df == "uknown"|df == "Uknown")
## CONTROL_NUMBER DONATED DONATION_AMT
## 0 0 NA
## DONOR_AGE URBANICITY DONOR_GENDER
## NA 0 0
## HOME_OWNER INCOME_LEVEL SES
## 0 NA 0
## MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME DONATION_RESPONSE
## 0 0 0
## MONTHS_SINCE_LAST_GIFT EMAILS_12 LIFETIME_GIFT_COUNT
## 0 0 0
## LIFETIME_EMAILS LIFETIME_GIFT_AMOUNT LIFETIME_MAX_GIFT_AMT
## 0 0 0
## LIFETIME_MIN_GIFT_AMT LIFETIME_AVG_GIFT_AMT
## 0 0
Lets investigate columns : INCOME_LEVEL , DONOR_AGE
NOTE: the target variable has NA values.
There are no empty strings in the data frame.
There are no “Unknown” or “unknown” entries in the data frame. Yay!
# DONOR_AGE - the age of donors in years
unique(df$DONOR_AGE)
## [1] 87 79 75 NA 74 63 71 41 73 81 76 69 45 43 55 77 37 52 30 67 65 59 42 58 56
## [26] 80 82 64 48 85 47 70 40 32 53 86 54 33 60 49 7 17 57 78 50 39 66 51 72 38
## [51] 61 68 24 16 84 44 34 83 35 29 26 27 46 31 62 23 36 18 6 25 28 19 21 22 0
## [76] 2 12 20 15 4 14
sum(is.na(df$DONOR_AGE))
## [1] 4795
# lets look at the rows with NA values
df[is.na(df$DONOR_AGE), ]
table(df$DONOR_AGE)
##
## 0 2 4 6 7 12 14 15 16 17 18 19 20 21 22 23 24 25 26 27
## 2 3 1 10 74 1 1 2 11 101 2 5 1 13 9 23 10 46 35 124
## 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
## 37 79 55 111 65 137 80 202 117 282 112 244 160 261 179 278 170 343 178 358
## 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
## 212 404 237 364 177 321 212 361 181 365 186 320 169 264 208 271 178 347 228 411
## 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87
## 231 343 225 350 239 395 295 380 281 381 251 326 215 276 192 239 140 174 144 162
summary(df$DONOR_AGE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 47.00 60.00 58.92 73.00 87.00 4795
# sum of donors who are 18 years and below
sum(df$DONOR_AGE < 18, na.rm = TRUE)
## [1] 206
# lets use a threshold of 18 and equate all donors <18 to NA for age
df$DONOR_AGE[df$DONOR_AGE <18] <- NA
# impute the NA with the median age
median_age <- median(df$DONOR_AGE, na.rm = TRUE)
df$DONOR_AGE[is.na(df$DONOR_AGE)] <- median_age
# verify the change
sum(is.na(df$DONOR_AGE))
## [1] 0
table(df$DONOR_AGE)
##
## 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33
## 2 5 1 13 9 23 10 46 35 124 37 79 55 111 65 137
## 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
## 80 202 117 282 112 244 160 261 179 278 170 343 178 358 212 404
## 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
## 237 364 177 321 212 361 181 365 186 320 5170 264 208 271 178 347
## 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81
## 228 411 231 343 225 350 239 395 295 380 281 381 251 326 215 276
## 82 83 84 85 86 87
## 192 239 140 174 144 162
sum(df$DONOR_AGE < 18,na.rm = TRUE)
## [1] 0
colSums(is.na(df))
## CONTROL_NUMBER DONATED DONATION_AMT
## 0 0 14529
## DONOR_AGE URBANICITY DONOR_GENDER
## 0 0 0
## HOME_OWNER INCOME_LEVEL SES
## 0 4392 0
## MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME DONATION_RESPONSE
## 0 0 0
## MONTHS_SINCE_LAST_GIFT EMAILS_12 LIFETIME_GIFT_COUNT
## 0 0 0
## LIFETIME_EMAILS LIFETIME_GIFT_AMOUNT LIFETIME_MAX_GIFT_AMT
## 0 0 0
## LIFETIME_MIN_GIFT_AMT LIFETIME_AVG_GIFT_AMT
## 0 0
There are 4795 NA values in DONOR_AGE
There is a donor with 0 age which is likely a data entry error.
ACTION: I set a threshold of 18 and all DONOR-AGE below 18 were set to NA then imputed all NA values with the median age.
# INCOME_LEVEL: one of 7 income levels; higher means more income; exact bracket structure is unknown
unique(df$INCOME_LEVEL)
## [1] 2 7 5 6 3 1 4 NA
table(df$INCOME_LEVEL)
##
## 1 2 3 4 5 6 7
## 1822 2626 1699 2526 3149 1591 1567
sum(is.na(df$INCOME_LEVEL))
## [1] 4392
# check the rows that have na INCOME_LEVEL
df[is.na(df$INCOME_LEVEL), ]
# check what levsls 1- 7 mean using the donation amount col
tapply(df$DONATION_AMT, df$INCOME_LEVEL, mean, na.rm = TRUE)
## 1 2 3 4 5 6 7
## 1342.287 1378.962 1434.752 1528.672 1650.825 1746.520 1769.484
# imput the the NA values with the median
median_income <- median(df$INCOME_LEVEL, na.rm = TRUE)
df$INCOME_LEVEL[is.na(df$INCOME_LEVEL)] <- median_income
# verify the change
sum(is.na(df$INCOME_LEVEL))
## [1] 0
colSums(is.na(df))
## CONTROL_NUMBER DONATED DONATION_AMT
## 0 0 14529
## DONOR_AGE URBANICITY DONOR_GENDER
## 0 0 0
## HOME_OWNER INCOME_LEVEL SES
## 0 0 0
## MEDIAN_HOME_VALUE MEDIAN_HOUSEHOLD_INCOME DONATION_RESPONSE
## 0 0 0
## MONTHS_SINCE_LAST_GIFT EMAILS_12 LIFETIME_GIFT_COUNT
## 0 0 0
## LIFETIME_EMAILS LIFETIME_GIFT_AMOUNT LIFETIME_MAX_GIFT_AMT
## 0 0 0
## LIFETIME_MIN_GIFT_AMT LIFETIME_AVG_GIFT_AMT
## 0 0
There are 7 levels of income.
In the INCOME_LEVEL column there are 0 NA values
The rows with NA INCOME_LEVEL entries have no specific patttern.
For the seven income levels; 1 is the least with 277.0033 and 7 is the highest with 481.0466.
ACTION TAKEN:
Impute the NA values with the median of the INCOME_LEVEl column.
The INCOME_LEVEL column is ordinal and the differences between the levels of income differ hence the median would be the best to imput the NA values.
# plot the INCOME_LEVEL
ggplot(df, aes(x = INCOME_LEVEL)) +
geom_bar(fill = "steelblue") +
labs(title="Distribution of the levels of income") +
theme(axis.text = element_text(angle = 45, vjust = 0.5, hjust = 1))
#### Observation
There is an increase in income level 4 after imputing NA values with the median income.
# SES: socio-economic status for donor cluster; higher number means a higher socio-economic status
unique(df$SES)
## [1] "." "45" "11" "4" "49" "8" "50" "28" "30" "43" "53" "42" "46" "20" "16"
## [16] "40" "7" "34" "23" "35" "41" "25" "10" "1" "9" "2" "12" "14" "37" "36"
## [31] "15" "39" "38" "18" "48" "24" "3" "13" "31" "5" "27" "19" "51" "22" "17"
## [46] "26" "21" "44" "6" "29" "33" "47" "32" "52"
table(unique(df$SES))
##
## . 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27 28 29 3 30 31
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 32 33 34 35 36 37 38 39 4 40 41 42 43 44 45 46 47 48 49 5 50 51 52 53 6 7
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 8 9
## 1 1
# compare SES col with DONATION AMT
tapply(df$DONATION_AMT, df$SES, mean, na.rm = TRUE)
## . 1 10 11 12 13 14 15
## 2043.007 2074.627 1556.522 1684.252 1638.125 2033.918 1675.984 1827.885
## 16 17 18 19 2 20 21 22
## 1351.923 1533.721 1549.702 1320.000 1565.152 1519.355 1836.824 1651.538
## 23 24 25 26 27 28 29 3
## 1523.239 1571.065 1432.812 1337.255 1519.880 1533.028 1920.000 1852.059
## 30 31 32 33 34 35 36 37
## 1209.174 1562.903 1507.407 1211.538 1632.353 1702.475 1357.838 1560.465
## 38 39 4 40 41 42 43 44
## 1450.769 1504.839 1500.000 1462.979 1420.833 1618.589 1388.462 1554.730
## 45 46 47 48 49 5 50 51
## 1318.019 1443.269 1241.176 1293.750 1465.541 1612.766 1662.857 1269.307
## 52 53 6 7 8 9
## 1076.923 1445.455 1727.273 1813.636 1495.000 1326.471
# find the mode of the SES column
mode_SES <- as.numeric(sort(table(df$SES), decreasing = TRUE)[1])
df$SES[df$SES == "."] <- mode_SES
# verify the changes
table(unique(df$SES))
##
## 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 23 24 25 26 27
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 28 29 3 30 31 32 33 34 35 36 37 38 39 4 40 41 42 43 44 45
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 46 47 48 49 5 50 51 52 53 6 7 8 830 9
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
df$SES <- as.numeric(df$SES)
The SES ranges from 1 to 830 -> Wide range!
There is no specific pattern between SES and DONATION_AMT
There is 1 “.” - This is not an appropriate data entry for the model.
# URBANICITY: the “urbanicity” of the donor’s place of residence:
# U = urban (i.e., lives in a (large) city),
# S = suburban (outside but near the city),
# T = town (lives in a “small” town),
# R = rural (e.g., farm, away from town) ,
# ? = unknown
unique(df$URBANICITY)
## [1] "?" "R" "S" "U" "C" "T"
table(df$URBANICITY, useNA = "always")
##
## ? C R S T U <NA>
## 454 4022 4005 4491 3944 2456 0
sum(df$URBANICITY == "?")
## [1] 454
sum(df$URBANICITY == "C")
## [1] 4022
# The metadata does not have "C"
# I will combine the '?' and 'C' as NA values
df$URBANICITY[df$URBANICITY == "C"] <- NA
df$URBANICITY[df$URBANICITY == "?"] <- NA
# verify the change
sum(df$URBANICITY == "?")
## [1] NA
sum(df$URBANICITY == "C")
## [1] NA
# impute the NA with the mode
mode_URBANICITY <- names(sort(table(df$URBANICITY), decreasing = TRUE)[1])
df$URBANICITY[is.na(df$URBANICITY)] <- mode_URBANICITY
# verify the change
table(df$URBANICITY, useNA = "always")
##
## R S T U <NA>
## 4005 8967 3944 2456 0
unique(df$URBANICITY)
## [1] "S" "R" "U" "T"
table(is.na(df$URBANICITY))
##
## FALSE
## 19372
# impute the NA entries to numerical values
dummies <- dummyVars(~URBANICITY, data = df)
URBANICITY_encoded <- predict(dummies, df) %>% as.data.frame()
df <- bind_cols(
df %>% select(-URBANICITY), URBANICITY_encoded
)
# verify for any NA
sum(is.na(df$URBANICITYR))
## [1] 0
sum(is.na(df$URBANICITYS))
## [1] 0
sum(is.na(df$URBANICITYT))
## [1] 0
sum(is.na(df$URBANICITYU))
## [1] 0
There is “?” which could be a data entry error and can be NA
The metadata does not have “C” but the data set has “C”
Combined both “C” and “?” as one column and changed to NA
Imputed the NA values with the mode of URBANICUTY
Used dummy coding for the column URBANICITY for the model.
Checked to verify the changes.
# lets explore the binary columns - DONATED , DONOR_GENDER, HOME_OWNER
table(unique(df$DONOR_GENDER))
##
## F M U
## 1 1 1
table(unique(df$HOME_OWNER))
##
## H N U
## 1 1 1
# DONOR-GENEDER is nominal categorical
# use one hot encoding
dummies <- dummyVars(~DONOR_GENDER, data = df)
DONOR_GENDER_encoded <- predict(dummies, df) %>% as.data.frame()
df <- bind_cols(
df %>% select(-DONOR_GENDER), DONOR_GENDER_encoded
)
# verify the changes
colnames(df)
## [1] "CONTROL_NUMBER" "DONATED"
## [3] "DONATION_AMT" "DONOR_AGE"
## [5] "HOME_OWNER" "INCOME_LEVEL"
## [7] "SES" "MEDIAN_HOME_VALUE"
## [9] "MEDIAN_HOUSEHOLD_INCOME" "DONATION_RESPONSE"
## [11] "MONTHS_SINCE_LAST_GIFT" "EMAILS_12"
## [13] "LIFETIME_GIFT_COUNT" "LIFETIME_EMAILS"
## [15] "LIFETIME_GIFT_AMOUNT" "LIFETIME_MAX_GIFT_AMT"
## [17] "LIFETIME_MIN_GIFT_AMT" "LIFETIME_AVG_GIFT_AMT"
## [19] "URBANICITYR" "URBANICITYS"
## [21] "URBANICITYT" "URBANICITYU"
## [23] "DONOR_GENDERF" "DONOR_GENDERM"
## [25] "DONOR_GENDERU"
There is “U” - unknown for gender. This could be legitimate data. Probably the gender was not recorded during data entry
It is a nominal categorical- I’ll use one hot encoding for the model and verify the changes.
# HOME_OWNER: whether donor is a home owner or not: H = yes, U = unknown, N = no
# use one-hot-encoding
dummies <- dummyVars(~HOME_OWNER, data = df)
HOME_OWNER_encoded <- predict(dummies, df) %>% as.data.frame()
df <- bind_cols(
df%>% select(-HOME_OWNER), HOME_OWNER_encoded
)
# verify the changes
colnames(df)
## [1] "CONTROL_NUMBER" "DONATED"
## [3] "DONATION_AMT" "DONOR_AGE"
## [5] "INCOME_LEVEL" "SES"
## [7] "MEDIAN_HOME_VALUE" "MEDIAN_HOUSEHOLD_INCOME"
## [9] "DONATION_RESPONSE" "MONTHS_SINCE_LAST_GIFT"
## [11] "EMAILS_12" "LIFETIME_GIFT_COUNT"
## [13] "LIFETIME_EMAILS" "LIFETIME_GIFT_AMOUNT"
## [15] "LIFETIME_MAX_GIFT_AMT" "LIFETIME_MIN_GIFT_AMT"
## [17] "LIFETIME_AVG_GIFT_AMT" "URBANICITYR"
## [19] "URBANICITYS" "URBANICITYT"
## [21] "URBANICITYU" "DONOR_GENDERF"
## [23] "DONOR_GENDERM" "DONOR_GENDERU"
## [25] "HOME_OWNERH" "HOME_OWNERN"
## [27] "HOME_OWNERU"
HOME_OWNER is a nominal categorical column. I’ll use one hot encoding for the model and verify the changes.
# check the MEDIAN_HOME_VALUE
table(df$MEDIAN_HOME_VALUE[df$MEDIAN_HOME_VALUE == 0])
##
## 0
## 218
There are 218 count of zero median home value.
Is this possible? turns out -> a median home price of $0 indicates that 50% or more of the analyzed transactions in a particular, small dataset involved a $0 exchange of value.
So this is not a data entry error as i had previously thought in Part A.
str(df)
## 'data.frame': 19372 obs. of 27 variables:
## $ CONTROL_NUMBER : int 5 12 37 38 41 52 53 67 70 71 ...
## $ DONATED : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
## $ DONATION_AMT : int NA 1000 NA NA NA NA NA 500 NA 1600 ...
## $ DONOR_AGE : int 87 79 75 60 74 63 71 79 41 63 ...
## $ INCOME_LEVEL : num 2 7 5 6 2 3 5 1 4 4 ...
## $ SES : num 830 45 11 4 49 8 50 28 30 43 ...
## $ MEDIAN_HOME_VALUE : num 554000 334000 2388000 1688000 514000 ...
## $ MEDIAN_HOUSEHOLD_INCOME: num 294000 212000 405000 153000 328000 182000 122000 189000 180000 307000 ...
## $ DONATION_RESPONSE : int 0 0 0 0 8 0 0 0 0 0 ...
## $ MONTHS_SINCE_LAST_GIFT : int 26 7 6 6 18 19 21 9 17 9 ...
## $ EMAILS_12 : int 17 28 43 44 25 29 25 33 36 38 ...
## $ LIFETIME_GIFT_COUNT : int 35 25 36 37 12 22 20 20 34 22 ...
## $ LIFETIME_EMAILS : int 19 32 44 31 30 22 29 33 36 27 ...
## $ LIFETIME_GIFT_AMOUNT : num 29700 36800 60300 43500 10600 12800 22000 10100 17100 15000 ...
## $ LIFETIME_MAX_GIFT_AMT : num 2000 2500 2800 1700 2500 1000 2000 700 700 1100 ...
## $ LIFETIME_MIN_GIFT_AMT : int 500 500 500 300 500 300 500 300 300 300 ...
## $ LIFETIME_AVG_GIFT_AMT : int 849 1472 1675 1176 883 582 1100 505 503 682 ...
## $ URBANICITYR : num 0 1 0 0 1 0 1 0 0 1 ...
## $ URBANICITYS : num 1 0 1 0 0 0 0 1 1 0 ...
## $ URBANICITYT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ URBANICITYU : num 0 0 0 1 0 1 0 0 0 0 ...
## $ DONOR_GENDERF : num 0 0 1 1 1 0 0 1 1 1 ...
## $ DONOR_GENDERM : num 1 1 0 0 0 1 1 0 0 0 ...
## $ DONOR_GENDERU : num 0 0 0 0 0 0 0 0 0 0 ...
## $ HOME_OWNERH : num 1 1 1 1 0 0 1 1 1 1 ...
## $ HOME_OWNERN : num 0 0 0 0 0 0 0 0 0 0 ...
## $ HOME_OWNERU : num 0 0 0 0 1 1 0 0 0 0 ...
df$URBANICITYU <- NULL
df$DONOR_GENDERU <- NULL
df$HOME_OWNERU <- NULL
# Remove columns that do not benefit the model
df$CONTROL_NUMBER <- NULL
# Verify the changes
str(df)
## 'data.frame': 19372 obs. of 23 variables:
## $ DONATED : logi FALSE TRUE FALSE FALSE FALSE FALSE ...
## $ DONATION_AMT : int NA 1000 NA NA NA NA NA 500 NA 1600 ...
## $ DONOR_AGE : int 87 79 75 60 74 63 71 79 41 63 ...
## $ INCOME_LEVEL : num 2 7 5 6 2 3 5 1 4 4 ...
## $ SES : num 830 45 11 4 49 8 50 28 30 43 ...
## $ MEDIAN_HOME_VALUE : num 554000 334000 2388000 1688000 514000 ...
## $ MEDIAN_HOUSEHOLD_INCOME: num 294000 212000 405000 153000 328000 182000 122000 189000 180000 307000 ...
## $ DONATION_RESPONSE : int 0 0 0 0 8 0 0 0 0 0 ...
## $ MONTHS_SINCE_LAST_GIFT : int 26 7 6 6 18 19 21 9 17 9 ...
## $ EMAILS_12 : int 17 28 43 44 25 29 25 33 36 38 ...
## $ LIFETIME_GIFT_COUNT : int 35 25 36 37 12 22 20 20 34 22 ...
## $ LIFETIME_EMAILS : int 19 32 44 31 30 22 29 33 36 27 ...
## $ LIFETIME_GIFT_AMOUNT : num 29700 36800 60300 43500 10600 12800 22000 10100 17100 15000 ...
## $ LIFETIME_MAX_GIFT_AMT : num 2000 2500 2800 1700 2500 1000 2000 700 700 1100 ...
## $ LIFETIME_MIN_GIFT_AMT : int 500 500 500 300 500 300 500 300 300 300 ...
## $ LIFETIME_AVG_GIFT_AMT : int 849 1472 1675 1176 883 582 1100 505 503 682 ...
## $ URBANICITYR : num 0 1 0 0 1 0 1 0 0 1 ...
## $ URBANICITYS : num 1 0 1 0 0 0 0 1 1 0 ...
## $ URBANICITYT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ DONOR_GENDERF : num 0 0 1 1 1 0 0 1 1 1 ...
## $ DONOR_GENDERM : num 1 1 0 0 0 1 1 0 0 0 ...
## $ HOME_OWNERH : num 1 1 1 1 0 0 1 1 1 1 ...
## $ HOME_OWNERN : num 0 0 0 0 0 0 0 0 0 0 ...
I notice that there are NA values in DONATION_AMT
This is the target variable so let me explore that before visualization
table(df$DONATION_AMT)
##
## 100 200 250 300 400 450 500 525 600 700 750 800 900
## 7 21 1 86 76 1 503 1 124 126 1 114 71
## 1000 1070 1100 1200 1250 1300 1392 1400 1500 1600 1687 1700 1750
## 941 1 118 161 11 64 1 81 591 85 1 63 1
## 1800 1825 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900
## 37 1 27 577 92 23 40 13 392 26 9 6 2
## 3000 3100 3200 3300 3400 3500 3600 3700 3800 4000 4100 4200 4300
## 99 3 12 3 2 39 9 6 8 27 2 2 2
## 4400 4421 4500 4600 4700 4800 5000 5100 5300 5500 6000 7500 9500
## 2 1 8 2 5 1 68 3 2 1 3 7 1
## 10000 10100 10200 15000 20000
## 22 1 1 1 4
table(df$DONATED)
##
## FALSE TRUE
## 14529 4843
The target variable has NA values.
The DONATED col has 14,529 people who didnt donate and 4,843 of people who donated.
Hence the NA values in the DONATION_AMT column are the 14,529 people who didnt donate.
The goal of this model is to predict the target feature “DONATION_AMOUNT” for those who donate.
ACTION: Filter the data to only those who donated inorder to make predictions of the amount.
# set DONATED to null
df$DONATED <- NULL
# Filter data to donors only
donors_only <- df %>% filter(DONATION_AMT >0)
# verify the changes
table(donors_only$DONATION_AMT)
##
## 100 200 250 300 400 450 500 525 600 700 750 800 900
## 7 21 1 86 76 1 503 1 124 126 1 114 71
## 1000 1070 1100 1200 1250 1300 1392 1400 1500 1600 1687 1700 1750
## 941 1 118 161 11 64 1 81 591 85 1 63 1
## 1800 1825 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900
## 37 1 27 577 92 23 40 13 392 26 9 6 2
## 3000 3100 3200 3300 3400 3500 3600 3700 3800 4000 4100 4200 4300
## 99 3 12 3 2 39 9 6 8 27 2 2 2
## 4400 4421 4500 4600 4700 4800 5000 5100 5300 5500 6000 7500 9500
## 2 1 8 2 5 1 68 3 2 1 3 7 1
## 10000 10100 10200 15000 20000
## 22 1 1 1 4
str(donors_only)
## 'data.frame': 4843 obs. of 22 variables:
## $ DONATION_AMT : int 1000 500 1600 300 1200 800 1500 1000 3300 2000 ...
## $ DONOR_AGE : int 79 79 63 60 81 45 87 77 37 30 ...
## $ INCOME_LEVEL : num 7 1 4 4 4 2 1 4 5 6 ...
## $ SES : num 45 28 43 43 45 30 20 34 23 35 ...
## $ MEDIAN_HOME_VALUE : num 334000 1004000 399000 475000 530000 ...
## $ MEDIAN_HOUSEHOLD_INCOME: num 212000 189000 307000 227000 236000 387000 222000 331000 327000 544000 ...
## $ DONATION_RESPONSE : int 0 0 0 0 24 4 6 0 7 11 ...
## $ MONTHS_SINCE_LAST_GIFT : int 7 9 9 22 6 5 17 5 8 20 ...
## $ EMAILS_12 : int 28 33 38 31 40 34 34 40 38 19 ...
## $ LIFETIME_GIFT_COUNT : int 25 20 22 31 29 22 24 16 21 9 ...
## $ LIFETIME_EMAILS : int 32 33 27 44 33 29 39 33 40 13 ...
## $ LIFETIME_GIFT_AMOUNT : num 36800 10100 15000 18600 27000 12200 26100 18600 47000 7600 ...
## $ LIFETIME_MAX_GIFT_AMT : num 2500 700 1100 1100 1500 800 1500 1700 3500 2000 ...
## $ LIFETIME_MIN_GIFT_AMT : int 500 300 300 200 500 300 300 500 1000 500 ...
## $ LIFETIME_AVG_GIFT_AMT : int 1472 505 682 600 931 555 1088 1163 2238 844 ...
## $ URBANICITYR : num 1 0 1 1 1 0 0 0 0 0 ...
## $ URBANICITYS : num 0 1 0 0 0 1 1 0 1 0 ...
## $ URBANICITYT : num 0 0 0 0 0 0 0 1 0 1 ...
## $ DONOR_GENDERF : num 0 1 1 1 0 0 1 0 1 0 ...
## $ DONOR_GENDERM : num 1 0 0 0 1 1 0 0 0 1 ...
## $ HOME_OWNERH : num 1 1 1 0 0 0 0 0 0 1 ...
## $ HOME_OWNERN : num 0 0 0 0 0 1 0 0 0 0 ...
working with 4843 observations(people who donated only) and 22 variables
# distribution of target variable
ggplot(donors_only, aes(x = DONATION_AMT)) +
geom_bar(fill = "steelblue") +
labs(title="Distribution of donation amounts") +
theme(axis.text = element_text(angle = 45, vjust = 0.5, hjust = 1))
#### Key findings
The data is right skewed which in reality it is expected as in donation campaigns, there are more small contributions with very few large or high contributions.
The bar plot shows most donors giving between 1 and 5000 amounts with very few donors giving between 5001 and 10,000.
To create the model i will use log transformation to compress the scale.
First I’ll create a correlation plot of numerical variables.
# make the plot larger so the numbers are readable
options(repr.plot.width = 15, repr.plot.height = 15)
numeric_cols <- donors_only %>%
select(where(is.numeric))
# correlation plot
par(mar = c(2,2,2,2)) # expand margins because the plot is smalL
# visualize the corr plot
corrplot(cor(numeric_cols),
method = 'color',
type = 'upper',
addCoef.col = 'black',
tl.srt = 45,
number.cex = 0.7,
tl.cex = 0.6
)
#### Key findings
STRONG PREDICTORS
LIFETIME_AVG_GIFT_AMT, LIFETIME_MAX_GIFT_AMT -> Individuals who have a history of giving are likely to give in the future.
LIFETIME_EMAILS - a person is likely to give when they receive an email about the campaign
LIFETIME_GIFT_COUNT -> The more a person has given in the past, the more they are likely to give in the future.
There is multicollinearity between:
LIFETIME_GIFT_COUNT and LIFETIME_EMAILS
DONOR_GENDERF and DONOR_GENDERM
MEDIAN_HOME_VALUE and MEDIAN_HOUSEHOLD_INCOME
cor_matrix <- cor(numeric_cols, use = "complete.obs")
donation_cors <- sort(cor_matrix["DONATION_AMT",], decreasing = TRUE)
print(round(donation_cors, 3))
## DONATION_AMT LIFETIME_AVG_GIFT_AMT LIFETIME_MAX_GIFT_AMT
## 1.000 0.517 0.443
## LIFETIME_MIN_GIFT_AMT LIFETIME_GIFT_AMOUNT MEDIAN_HOME_VALUE
## 0.296 0.248 0.126
## MEDIAN_HOUSEHOLD_INCOME INCOME_LEVEL MONTHS_SINCE_LAST_GIFT
## 0.118 0.105 0.090
## SES EMAILS_12 URBANICITYS
## 0.058 0.055 0.042
## DONOR_GENDERM HOME_OWNERH DONATION_RESPONSE
## 0.038 0.013 0.013
## URBANICITYT DONOR_GENDERF HOME_OWNERN
## -0.022 -0.040 -0.044
## DONOR_AGE URBANICITYR LIFETIME_EMAILS
## -0.045 -0.056 -0.098
## LIFETIME_GIFT_COUNT
## -0.220
STRONG PREDICTORS:
LIFETIME_AVG_GIFT_AMT - 0.517
LIFETIME_MAX_GIFT_AMT - 0.443
LIFETIME_MIN_GIFT_AMT - 0.296
LIFETIME_GIFT_AMOUNT - 0.248
WEAK PREDICTORS:
MEDIAN_HOME_VALUE
MEDIAN_HOUSEHOLD_INCOME
INCOME_LEVEL
MONTHS_SINCE_LAST_GIFT
SES
EMAILS_12
NEGATIVE PREDICTORS:
URBANICITY
GENDER
HOME OWNER
LIFETIME_GIFT_EMAILS
LIFETIME_GIFT_COUNT
LIFETIME_AVG_GIFT_AMT,LIFETIME_MAX_GIFT_AMT, LIFETIME_MIN_GIFT_AMT, LIFETIME_GIFT_AMOUNT all seem to predict the donation amount so i will choose LIFETIME_AVG_GIFT_AMT instead of having all four.
For the weak predictors i will keep:
MEDIAN_HOME_VALUE - beacsue a person who is able to afford a good home is likley able to donate. Hifgher socio-economic status individuals are likely to make a higher donation compared to lower socio-economic class individuals
INCOME_LEVEL - higher income level equals to higher SES which equals to higher MEDIAN_HOME_VALUE
NB: MEDIAN_HOME_VALUE, INCOME_LEVEL, SES, & MEDIAN_HOUSEHOLD_INCOME all seem to form a chain where one affects the other other so ill choose one instead of having the four.
MONTHS_SINCE_LAST_GIFT - if last donation was more current, the individuals are liklly to donate but ammount can be varying.
ACTION:
I'll drop MEDIAN_HOME_VALUE, MEDIAN_HOUSEHOLD_INCOME, SES, LIFETIME_MAX_GIFT_AMT, LIFETIME_MIN_GIFT_AMT, LIFETIME_GIFT_AMOUNT for the model.
For URBANICITY DUMMIES, HOMEOWNER and DONOR_GENDER, I'll use one as a reference foe the others not listed. For example DONOR_GENDERM is listed and any unidivual who us not male is either female or gender not listed(UNKNOWN)
donors_only_model <- donors_only %>%
select("DONATION_AMT",
"DONOR_AGE",
"INCOME_LEVEL",
"DONATION_RESPONSE",
"MONTHS_SINCE_LAST_GIFT",
"EMAILS_12",
"LIFETIME_AVG_GIFT_AMT",
"URBANICITYR",
"DONOR_GENDERM",
"HOME_OWNERH",
"LIFETIME_EMAILS"
)
str(donors_only_model)
## 'data.frame': 4843 obs. of 11 variables:
## $ DONATION_AMT : int 1000 500 1600 300 1200 800 1500 1000 3300 2000 ...
## $ DONOR_AGE : int 79 79 63 60 81 45 87 77 37 30 ...
## $ INCOME_LEVEL : num 7 1 4 4 4 2 1 4 5 6 ...
## $ DONATION_RESPONSE : int 0 0 0 0 24 4 6 0 7 11 ...
## $ MONTHS_SINCE_LAST_GIFT: int 7 9 9 22 6 5 17 5 8 20 ...
## $ EMAILS_12 : int 28 33 38 31 40 34 34 40 38 19 ...
## $ LIFETIME_AVG_GIFT_AMT : int 1472 505 682 600 931 555 1088 1163 2238 844 ...
## $ URBANICITYR : num 1 0 1 1 1 0 0 0 0 0 ...
## $ DONOR_GENDERM : num 1 0 0 0 1 1 0 0 0 1 ...
## $ HOME_OWNERH : num 1 1 1 0 0 0 0 0 0 1 ...
## $ LIFETIME_EMAILS : int 32 33 27 44 33 29 39 33 40 13 ...
The data is right skewed, the best solution would be log transformation
Anthropic. (2025). Claude. Claude.ai. https://claude.ai/ I used AI to better understand how to interepret my findings before and after log transformation
# perform log transformation
donors_only_model$LOG_DONATION_AMT <- log(donors_only_model$DONATION_AMT)
# plot a histogram after log transformation
hist(donors_only_model$LOG_DONATION_AMT,
main = "DONATION AMOUNT - AFTER LOG TRANSFORMATION",
xlab = "Log(Donation Amount)",
col = "darkorange")
# remove "DONATION_AMT"
donors_only_model$DONATION_AMT <- NULL
set.seed(123)
# create a train index
train_index <- createDataPartition(donors_only_model$LOG_DONATION_AMT,p = 0.8,list = FALSE)
# split into training and testing data
train_data <- donors_only_model[train_index, ]
test_data <- donors_only_model[-train_index, ]
# check the total observations
total_observations <- sum(nrow(train_data) + nrow(test_data))
# verify
nrow(train_data)
## [1] 3875
nrow(test_data)
## [1] 968
total_observations
## [1] 4843
prop.table(table(train_data$LOG_DONATION_AMT))
##
## 4.60517018598809 5.29831736654804 5.52146091786225 5.7037824746562
## 0.0018064516 0.0038709677 0.0002580645 0.0185806452
## 5.99146454710798 6.21460809842219 6.26339826259162 6.39692965521615
## 0.0157419355 0.1016774194 0.0002580645 0.0252903226
## 6.5510803350434 6.62007320653036 6.68461172766793 6.80239476332431
## 0.0260645161 0.0002580645 0.0224516129 0.0144516129
## 6.90775527898214 6.97541392745595 7.00306545878646 7.09007683577609
## 0.1974193548 0.0002580645 0.0255483871 0.0330322581
## 7.13089883029635 7.17011954344963 7.23849684089437 7.24422751560335
## 0.0020645161 0.0123870968 0.0002580645 0.0167741935
## 7.3132203870903 7.37775890822787 7.43070708254597 7.43838353004431
## 0.1223225806 0.0170322581 0.0002580645 0.0139354839
## 7.49554194388426 7.50933526601659 7.54960916515453 7.60090245954208
## 0.0072258065 0.0002580645 0.0061935484 0.1181935484
## 7.64969262371151 7.69621263934641 7.74066440191724 7.78322401633604
## 0.0203870968 0.0054193548 0.0082580645 0.0025806452
## 7.82404601085629 7.86326672400957 7.90100705199242 7.9373746961633
## 0.0792258065 0.0049032258 0.0023225806 0.0010322581
## 7.97246601597457 8.00636756765025 8.03915739047324 8.07090608878782
## 0.0005161290 0.0198709677 0.0005161290 0.0023225806
## 8.10167774745457 8.13153071060425 8.1605182474775 8.1886891244442
## 0.0005161290 0.0002580645 0.0080000000 0.0015483871
## 8.21608809863232 8.24275634571448 8.29404964010203 8.3187422526924
## 0.0015483871 0.0012903226 0.0064516129 0.0005161290
## 8.34283980427146 8.36637030168165 8.38935981990635 8.41183267575841
## 0.0002580645 0.0005161290 0.0002580645 0.0012903226
## 8.43381158247719 8.45531778769815 8.47637119689598 8.51719319141624
## 0.0005161290 0.0012903226 0.0002580645 0.0144516129
## 8.53699581871242 8.57546209954021 8.61250337122056 8.69951474821019
## 0.0005161290 0.0002580645 0.0002580645 0.0007741935
## 8.9226582995244 9.15904707758863 9.21034037197618 9.22029070282935
## 0.0015483871 0.0002580645 0.0049032258 0.0002580645
## 9.23014299927236 9.61580548008435 9.90348755253613
## 0.0002580645 0.0002580645 0.0005161290
prop.table(table(test_data$LOG_DONATION_AMT))
##
## 5.29831736654804 5.7037824746562 5.99146454710798 6.10924758276437
## 0.006198347 0.014462810 0.015495868 0.001033058
## 6.21460809842219 6.39692965521615 6.5510803350434 6.68461172766793
## 0.112603306 0.026859504 0.025826446 0.027892562
## 6.80239476332431 6.90775527898214 7.00306545878646 7.09007683577609
## 0.015495868 0.181818182 0.019628099 0.034090909
## 7.13089883029635 7.17011954344963 7.24422751560335 7.3132203870903
## 0.003099174 0.016528926 0.016528926 0.120867769
## 7.37775890822787 7.43838353004431 7.46737106691756 7.49554194388426
## 0.019628099 0.009297521 0.001033058 0.009297521
## 7.54960916515453 7.60090245954208 7.64969262371151 7.69621263934641
## 0.003099174 0.122933884 0.013429752 0.002066116
## 7.74066440191724 7.78322401633604 7.82404601085629 7.86326672400957
## 0.008264463 0.003099174 0.087809917 0.007231405
## 7.9373746961633 8.00636756765025 8.03915739047324 8.07090608878782
## 0.002066116 0.022727273 0.001033058 0.003099174
## 8.10167774745457 8.13153071060425 8.1605182474775 8.1886891244442
## 0.001033058 0.001033058 0.008264463 0.003099174
## 8.24275634571448 8.29404964010203 8.34283980427146 8.38935981990635
## 0.003099174 0.002066116 0.001033058 0.001033058
## 8.39412119382624 8.41183267575841 8.51719319141624 8.53699581871242
## 0.001033058 0.003099174 0.012396694 0.001033058
## 8.57546209954021 8.9226582995244 9.21034037197618 9.90348755253613
## 0.001033058 0.001033058 0.003099174 0.002066116
model <- lm(LOG_DONATION_AMT~., data = train_data)
summary(model)
##
## Call:
## lm(formula = LOG_DONATION_AMT ~ ., data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.9721 -0.2927 0.0525 0.3579 2.8500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.178e+00 8.702e-02 70.995 < 2e-16 ***
## DONOR_AGE -6.872e-04 6.990e-04 -0.983 0.3256
## INCOME_LEVEL 3.010e-02 6.175e-03 4.875 1.13e-06 ***
## DONATION_RESPONSE -8.224e-04 9.643e-04 -0.853 0.3938
## MONTHS_SINCE_LAST_GIFT 2.715e-02 2.621e-03 10.360 < 2e-16 ***
## EMAILS_12 1.275e-02 1.888e-03 6.750 1.70e-11 ***
## LIFETIME_AVG_GIFT_AMT 2.504e-04 8.731e-06 28.674 < 2e-16 ***
## URBANICITYR -5.910e-02 2.303e-02 -2.566 0.0103 *
## DONOR_GENDERM 4.307e-02 1.849e-02 2.329 0.0199 *
## HOME_OWNERH -1.216e-02 1.985e-02 -0.612 0.5403
## LIFETIME_EMAILS -6.027e-03 1.184e-03 -5.088 3.78e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5562 on 3864 degrees of freedom
## Multiple R-squared: 0.2525, Adjusted R-squared: 0.2505
## F-statistic: 130.5 on 10 and 3864 DF, p-value: < 2.2e-16
Based on the model;
DONOR_AGE, DONATION_RESPONSE are not significant.
MONTHS_SINCE_LAST_GIFT is highly significant. May be human psychological behavior of guilt for not having donated for a while? Also, donors who give large donations are likely
to have time gaps between donations while donors who give small donations are likely to give frequent donations.
INCOME_LEVEL is highly significant which was expected. A person who does not have to worry about their finances is higly likly to give.
URBANICITYR is significant but not the most significant. It seems that donations are not always based on place of living.
LIFETIME_EMAILS is negatively highly significant. Constant communication is likly be monotonous for an individual while EMAILS_12 is positively highly sognificant which could
mean that communication via email of only the campaign is likley to bring in new donors and encourage previous donors to donate. Constant emails causes "donor fatigue" but
targeted communications increases donations.
ACTION: build a model 2 with the most significant variables.
model2 <- lm(LOG_DONATION_AMT~ INCOME_LEVEL + MONTHS_SINCE_LAST_GIFT + EMAILS_12 + LIFETIME_AVG_GIFT_AMT + URBANICITYR + DONOR_GENDERM + LIFETIME_EMAILS,
data = train_data)
summary(model2)
##
## Call:
## lm(formula = LOG_DONATION_AMT ~ INCOME_LEVEL + MONTHS_SINCE_LAST_GIFT +
## EMAILS_12 + LIFETIME_AVG_GIFT_AMT + URBANICITYR + DONOR_GENDERM +
## LIFETIME_EMAILS, data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.9787 -0.2919 0.0491 0.3573 2.8143
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.131e+00 7.293e-02 84.065 < 2e-16 ***
## INCOME_LEVEL 2.978e-02 5.640e-03 5.280 1.36e-07 ***
## MONTHS_SINCE_LAST_GIFT 2.734e-02 2.611e-03 10.468 < 2e-16 ***
## EMAILS_12 1.268e-02 1.883e-03 6.736 1.87e-11 ***
## LIFETIME_AVG_GIFT_AMT 2.504e-04 8.730e-06 28.683 < 2e-16 ***
## URBANICITYR -5.598e-02 2.288e-02 -2.446 0.0145 *
## DONOR_GENDERM 4.109e-02 1.843e-02 2.230 0.0258 *
## LIFETIME_EMAILS -6.244e-03 1.170e-03 -5.338 9.93e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5562 on 3867 degrees of freedom
## Multiple R-squared: 0.252, Adjusted R-squared: 0.2506
## F-statistic: 186.1 on 7 and 3867 DF, p-value: < 2.2e-16
All variables are significant in model2.
predictions <- predict(model2, newdata = test_data)
# calculate the RMSE & MAE
residual <- test_data$LOG_DONATION_AMT - predictions
rmse <- sqrt(mean(residual^2))
mae <- mean(abs(residual))
# back transform for interpretation
predicted_funds <- exp(predictions)
actual_funds <- exp(test_data$LOG_DONATION_AMT)
# calculate the RMSE & MAE
rmse_funds <- sqrt(mean((actual_funds - predicted_funds)^2))
mae_funds <- mean(abs(actual_funds - predicted_funds))
rmse_funds
## [1] 1119.25
mae_funds
## [1] 594.4031
On a log scale the model2 predicts RMSE: 0.52 and MAE: 0.4. When transformed back to dollars the model’s predictions are off by $594.4
but if it makes bigger mistakes, its off by $1119.25 which is almost twice as much. This means that the model makes larger errors for large donations which is
likely due to the unpredictability nature of large donations.
par(mfrow = c(2,2))
plot(model2)
There seems to be a huge gap between donor 1286 and other donors.
Remove donor 1286 and create model3
new_train_data<- train_data %>%
filter(LIFETIME_AVG_GIFT_AMT < 45000)
model3 <- lm(LOG_DONATION_AMT~ INCOME_LEVEL + MONTHS_SINCE_LAST_GIFT + EMAILS_12 + LIFETIME_AVG_GIFT_AMT + URBANICITYR + DONOR_GENDERM + LIFETIME_EMAILS,
data = new_train_data)
summary(model3)
##
## Call:
## lm(formula = LOG_DONATION_AMT ~ INCOME_LEVEL + MONTHS_SINCE_LAST_GIFT +
## EMAILS_12 + LIFETIME_AVG_GIFT_AMT + URBANICITYR + DONOR_GENDERM +
## LIFETIME_EMAILS, data = new_train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8184 -0.2617 0.0464 0.3129 2.7266
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.0269641 0.0660255 91.282 < 2e-16 ***
## INCOME_LEVEL 0.0221973 0.0051046 4.348 1.41e-05 ***
## MONTHS_SINCE_LAST_GIFT 0.0220999 0.0023672 9.336 < 2e-16 ***
## EMAILS_12 0.0069177 0.0017136 4.037 5.52e-05 ***
## LIFETIME_AVG_GIFT_AMT 0.0004589 0.0000106 43.271 < 2e-16 ***
## URBANICITYR -0.0447049 0.0206907 -2.161 0.0308 *
## DONOR_GENDERM 0.0298582 0.0166641 1.792 0.0732 .
## LIFETIME_EMAILS -0.0014502 0.0010699 -1.355 0.1754
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5028 on 3866 degrees of freedom
## Multiple R-squared: 0.3888, Adjusted R-squared: 0.3877
## F-statistic: 351.3 on 7 and 3866 DF, p-value: < 2.2e-16
Interesting how one donor has led to an increase in R squared and adjusted r squared.
Also, that one donor made DONOR_GENDERM and LIFETIME_EMAILS significant and now they are not.
I will remove DONOR_GENDERM and LIFETIME_EMAILS
model4 <- lm(LOG_DONATION_AMT~ INCOME_LEVEL + MONTHS_SINCE_LAST_GIFT + EMAILS_12 + LIFETIME_AVG_GIFT_AMT + URBANICITYR + DONOR_GENDERM,
data = new_train_data)
summary(model4)
##
## Call:
## lm(formula = LOG_DONATION_AMT ~ INCOME_LEVEL + MONTHS_SINCE_LAST_GIFT +
## EMAILS_12 + LIFETIME_AVG_GIFT_AMT + URBANICITYR + DONOR_GENDERM,
## data = new_train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8579 -0.2612 0.0484 0.3170 2.7066
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.017e+00 6.566e-02 91.645 < 2e-16 ***
## INCOME_LEVEL 2.248e-02 5.101e-03 4.407 1.08e-05 ***
## MONTHS_SINCE_LAST_GIFT 2.169e-02 2.348e-03 9.237 < 2e-16 ***
## EMAILS_12 5.991e-03 1.572e-03 3.812 0.00014 ***
## LIFETIME_AVG_GIFT_AMT 4.627e-04 1.022e-05 45.298 < 2e-16 ***
## URBANICITYR -4.637e-02 2.066e-02 -2.245 0.02483 *
## DONOR_GENDERM 2.991e-02 1.667e-02 1.795 0.07277 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5028 on 3867 degrees of freedom
## Multiple R-squared: 0.3885, Adjusted R-squared: 0.3876
## F-statistic: 409.5 on 6 and 3867 DF, p-value: < 2.2e-16
In model4, DONOR_GENDERM is not statistically significant because it has not met p <0.05. However, gennder differences are always stated in fundraising.
par(mfrow = c(2,2))
plot(model4)
#### Key findings
There seems to be another outlier donor 1457 just like donor 1286 who had a lifetime donation amount of $45000 which is significantly outside the donation range.
However, the model has improved following removal of donor 1286.
I could also remove donor 1457 but at the same time, i would reduce the predictive validity of this model and limit it to small donors which is not the reality where high value donors will always be present.
Large donors are influenced by factors such as wealth management strategies and possibility of a personal relationship with the said organization or specific cause and other factors that were not present in this data set.
Rather than removing donor 1457, i suggest a separate model for major donors.
predictions4 <- predict(model4, newdata = test_data)
# calculate the RMSE & MAE
residual4 <- test_data$LOG_DONATION_AMT - predictions4
rmse4 <- sqrt(mean(residual4^2))
mae4 <- mean(abs(residual4))
rmse4
## [1] 0.483537
mae4
## [1] 0.3680748
# back transform for interpretation
predicted_funds4 <- exp(predictions4)
actual_funds4 <- exp(test_data$LOG_DONATION_AMT)
# calculate the RMSE & MAE
rmse_funds4 <- sqrt(mean((actual_funds4 - predicted_funds4)^2))
mae_funds4 <- mean(abs(actual_funds4 - predicted_funds4))
rmse_funds4
## [1] 2028.789
mae_funds4
## [1] 623.351
On a log scale the model4 predicts RMSE: 0.48 and MAE: 0.37.
When transformed back to dollars the model’s predictions are off by $623.35 but if it makes bigger mistakes, its off by $ 2028.79
Model4 outperforms model 2 indicating improvement however when back transformed to dollar values, RMSE incrases to $623.35 which is suggests that the model struggles with high donor predictions.
I would suggest a separate model for high value donors.
Model - all variables
Model2 - only significant variables included
Model3 - outlier removed (donor 1286)
Model4 - backward elimination - final model
# Train the model on three equal datasets
num_models <- 3
subset_size <- nrow(new_train_data) # multiple (different but same size) random subsets of the data
reg_models<- list()
for (i in 1:num_models) {
sample_indices <- sample(1:nrow(new_train_data),
size = subset_size,
replace = TRUE)
subset <- new_train_data[sample_indices, ]
# train logistic regression
reg_models[[i]] <- lm(LOG_DONATION_AMT~.,data = subset)
}
# get predictions from each model
predictions_1 <- predict(reg_models[[1]], newdata = test_data)
predictions_2 <- predict(reg_models[[2]], newdata = test_data)
predictions_3 <- predict(reg_models[[3]], newdata = test_data)
# get the average of the three predictions
ensemble_prediction <- (predictions_3 + predictions_2 + predictions_1) / 3
# calculate the RMSE & MAE
residual_ensemble <- test_data$LOG_DONATION_AMT - ensemble_prediction
rmse_ensemble <- sqrt(mean(residual_ensemble^2))
mae_ensemble <- mean(abs(residual_ensemble))
rmse_ensemble
## [1] 0.4810096
mae_ensemble
## [1] 0.36548
# back transform to real funds for interpretation
predicted_funds_ensemble <- exp(ensemble_prediction)
actual_funds_ensemble <- exp(test_data$LOG_DONATION_AMT)
rmse_funds_ensemble <- sqrt(mean((actual_funds_ensemble - predicted_funds_ensemble)^2))
mae_funds_ensemble <- mean(abs(actual_funds_ensemble - predicted_funds_ensemble))
rmse_funds_ensemble
## [1] 2429.004
mae_funds_ensemble
## [1] 642.7275
On an ensemble model, the predictions are RMSE: 0.48 and MAE: 0.37 before log transformation and RMSE: 2429
and MAE: 642.73 after log transformation compared to to a single model (model4) with the predictions RMSE: 0.48 and MAE: 0.37 before log
transformation and RMSE: 2028.79 and MAE: 623.35 after log transformation.
There is slight but unsignificant change between the single model and the ensemble model. For linear regression algorithm, bagging has low effect due to low variance.
This model predicts typical donor behavior but is limited towards large donors.
save.image(file = "MuchukiM.DA5030.Practicum2-PartB.RData")