1 Introduction

The motivation for this analysis is to target homeowner outreach for the Home Repair Tax Credit (HRTC). The purpose of the HRTC is to encourage homeowners to maintain their properties in ways that might lessen the impact of natural disasters like flooding. Over a 30-year span, 1 in 4 homes is likely to experience flooding, and the annual per-claim payout from the federally backed National Flood Insurance Program is $43,000. To incentivize active flood management, FEMA uses the Community Rating System (CRS) to reduce flood insurance premiums for communities that actively flood-proof. If marketing for the HRTC can be better targeted toward likely users of the credit, the local government of Emil City could see direct benefits such as increased property tax revenue and lower flood insurance costs for homeowners and businesses. For the purposes of this project, I define the four confusion matrix outcomes as follows:

  1. True Positive - Correctly predicted the homeowner would take the HRTC; marketing resources were allocated, and 25% of this group took the credit.
  2. True Negative - Correctly predicted the homeowner would not take the HRTC; no marketing resources were allocated and no credit was taken.
  3. False Positive - Incorrectly predicted the homeowner would take the HRTC; marketing resources were allocated but no credit was taken.
  4. False Negative - Predicted the homeowner would not take the HRTC; no marketing resources were allocated, but 11% of this group took the credit anyway. These are likely homeowners who signed up for reasons unrelated to the marketing campaign, so we '0 out' this category, assuming its baseline cost/benefit is $0.
options(scipen = 10000000)  # keep large numbers out of scientific notation

library(tidyverse)
library(caret)
library(knitr)
library(pscl)
library(plotROC)
library(pROC)
library(scales)
library(kableExtra)
library(lubridate)
library(gridExtra)

root.dir = "https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/DATA/"
source("https://raw.githubusercontent.com/urbanSpatial/Public-Policy-Analytics-Landing/master/functions.r")

# --- Setup: Aesthetics & Formula ----
palette5 <- c("#981FAC","#CB0F8B","#FF006A","#FE4C35","#FE9900")
palette4 <- c("#981FAC","#FF006A","#FE4C35","#FE9900")
palette2 <- c("#981FAC","#FF006A")

iterateThresholds <- function(data, observedClass, predictedProbs, group) {
  # This function takes a data frame with an observed binomial class (1 or 0),
  # a column of predicted probabilities, and optionally a group indicator
  # (e.g., race). For every threshold from 0.01 to 1 it returns accuracy plus
  # the counts and rates of each confusion matrix outcome. The duplicated
  # while loop handles the optional group argument via if (missing(group)).
  observedClass <- enquo(observedClass)
  predictedProbs <- enquo(predictedProbs)
  group <- enquo(group)
  x = .01
  all_prediction <- data.frame()
  
  if (missing(group)) {
    
    while (x <= 1) {
      this_prediction <- data.frame()
      
      this_prediction <-
        data %>%
        mutate(predclass = ifelse(!!predictedProbs > x, 1,0)) %>%
        count(predclass, !!observedClass) %>%
        summarize(Count_TN = sum(n[predclass==0 & !!observedClass==0]),
                  Count_TP = sum(n[predclass==1 & !!observedClass==1]),
                  Count_FN = sum(n[predclass==0 & !!observedClass==1]),
                  Count_FP = sum(n[predclass==1 & !!observedClass==0]),
                  Rate_TP = Count_TP / (Count_TP + Count_FN),
                  Rate_FP = Count_FP / (Count_FP + Count_TN),
                  Rate_FN = Count_FN / (Count_FN + Count_TP),
                  Rate_TN = Count_TN / (Count_TN + Count_FP),
                  Accuracy = (Count_TP + Count_TN) / 
                    (Count_TP + Count_TN + Count_FN + Count_FP)) %>%
        mutate(Threshold = round(x,2))
      
      all_prediction <- rbind(all_prediction,this_prediction)
      x <- x + .01
    }
    return(all_prediction)
  }
  else if (!missing(group)) { 
    while (x <= 1) {
      this_prediction <- data.frame()
      
      this_prediction <-
        data %>%
        mutate(predclass = ifelse(!!predictedProbs > x, 1,0)) %>%
        group_by(!!group) %>%
        count(predclass, !!observedClass) %>%
        summarize(Count_TN = sum(n[predclass==0 & !!observedClass==0]),
                  Count_TP = sum(n[predclass==1 & !!observedClass==1]),
                  Count_FN = sum(n[predclass==0 & !!observedClass==1]),
                  Count_FP = sum(n[predclass==1 & !!observedClass==0]),
                  Rate_TP = Count_TP / (Count_TP + Count_FN),
                  Rate_FP = Count_FP / (Count_FP + Count_TN),
                  Rate_FN = Count_FN / (Count_FN + Count_TP),
                  Rate_TN = Count_TN / (Count_TN + Count_FP),
                  Accuracy = (Count_TP + Count_TN) / 
                    (Count_TP + Count_TN + Count_FN + Count_FP)) %>%
        mutate(Threshold = round(x,2))
      
      all_prediction <- rbind(all_prediction,this_prediction)
      x <- x + .01
    }
    return(all_prediction)
  }
}
# --- Setup: Exploratory Analyses ----
## Recode age (18-39, 40-64, 65+), education, job, and month into coarser levels
homecred <- read.csv(file.path(root.dir,"/Chapter6/housingSubsidy.csv")) %>%
  mutate(age_range = case_when(age %in% 18:39 ~ "18-39",
                               age %in% 40:64 ~ "40-64",
                               age >= 65 ~ "65 or older"),
         edu_level = case_when(education %in% c("basic.4y","basic.6y","basic.9y","high.school") ~"Up to high school",
                               education %in% c("professional.course","university.degree") ~ "Post-secondary",
                               education %in% c("illiterate","unknown") ~ "Other"),
         # NB: if the raw data codes this job level as "admin." (with a period),
         # the "admin" match below will fail and those rows will be dropped by
         # na.omit(); check count(homecred, job) on the raw file first.
         job_level = case_when(job %in% c("entrepreneur","self-employed") ~ "self-employed",
                               job %in% c("housemaid","blue-collar") ~ "labor",
                               job %in% c("services","technician") ~ "services-skilled",
                               job %in% c("admin","management") ~ "office",
                               job %in% c("student","retired") ~ "not working",
                               job %in% c("unemployed","unknown") ~ "other"),
         season = case_when(month %in% c("dec","jan","feb") ~ "winter",
                            month %in% c("mar","apr","may") ~ "spring",
                            month %in% c("jun","jul","aug") ~ "summer",
                            month %in% c("sep","oct","nov") ~ "autumn")) %>%
  na.omit()
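
Because each case_when branch silently sends unmatched values to NA, and na.omit() then drops those rows, it is worth verifying the recoded levels before modeling. A minimal sanity check, using the column names defined above:

# Sanity check: confirm each engineered level is populated and see how many
# rows survived the recoding and na.omit()
count(homecred, job_level)
count(homecred, age_range, edu_level)
nrow(homecred)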

2 Exploratory Analysis

As shown below, the majority of individuals do not take the HRTC. However, past spending on repairs does appear to make a person more likely to take the HRTC, likely because the spending on improvements has already been completed, so relatively little activation effort is needed to use the credit. Those contacted by cellphone also appear more receptive to taking the credit than those reached by landline telephone. Few other correlations are immediately obvious, even after combining several education, age, time, and job parameters into fewer levels. With the new features, there does appear to be slightly more uptake of the HRTC in the spring and summer, as well as among those who are middle-aged (40-64 years old) with post-secondary education.

# Bar plot, continuous
homecred %>%
  dplyr::select(y, campaign, pdays, inflation_rate, spent_on_repairs) %>%
  gather(Variable, value, -y) %>%
  ggplot(aes(y, value, fill=y)) + 
  geom_bar(position = "dodge", stat = "summary", fun = "mean") + 
  facet_wrap(~Variable, scales = "free") +
  scale_fill_manual(values = palette2) +
  labs(x="Took Home Repair Credit", y="Mean", 
       title = "Feature associations with the likelihood of taking HRTC",
       subtitle = "(Continous outcomes)") +
  plotTheme() + theme(legend.position = "none")

# Bar plot of associated features (categorical)
homecred %>%
  dplyr::select(y, contact, season, taxLien, mortgage, taxbill_in_phl, poutcome) %>%
  gather(Variable, value, -y) %>%
  count(Variable, value, y) %>%
  ggplot(aes(value, n, fill = y)) +   
  geom_bar(position = "dodge", stat="identity") +
  facet_wrap(~Variable, scales="free") +
  scale_fill_manual(values = palette2) +
  labs(x="Took Home Repair Credit", y="Count",
       title = "Feature associations with the likelihood of taking HRTC",
       subtitle = "Three category features") +
  plotTheme() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Bar plot of associated features (categorical)
homecred %>%
  dplyr::select(y, age_range, job_level, marital, edu_level) %>%
  gather(Variable, value, -y) %>%
  count(Variable, value, y) %>%
  ggplot(aes(value, n, fill = y)) +   
  geom_bar(position = "dodge", stat="identity") +
  facet_wrap(~Variable, scales="free") +
  scale_fill_manual(values = palette2) +
  labs(x="Took Home Repair Credit", y="Count",
       title = "Feature associations with the likelihood of taking HRTC",
       subtitle = "Demographic") +
  plotTheme() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

3 Model Estimation

After engineering the season, edu_level, age_range, and job_level features, McFadden’s Pseudo-R2 actually decreases from 0.27 to 0.23.
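
The train/test split and model fits that produce testProbs1 and testProbs2 do not appear above, so here is a minimal sketch of that step. It assumes the 0/1 y_numeric column included in the book's housingSubsidy.csv and an arbitrary seed and 65/35 split; the predictor sets mirror the cross-validation fits in section 3.1.

# --- Model estimation (sketch) ----
set.seed(3456)
trainIndex <- createDataPartition(homecred$y, p = .65, list = FALSE, times = 1)
credTrain <- homecred[ trainIndex, ]
credTest  <- homecred[-trainIndex, ]

# Model 1: raw features; Model 2: engineered features
model1 <- glm(y_numeric ~ ., family = binomial(link = "logit"),
              data = credTrain %>%
                dplyr::select(y_numeric, age, job, month, contact, poutcome,
                              cons.price.idx, unemploy_rate, inflation_rate))
model2 <- glm(y_numeric ~ ., family = binomial(link = "logit"),
              data = credTrain %>%
                dplyr::select(y_numeric, age_range, job_level, edu_level, season,
                              contact, poutcome, cons.price.idx, unemploy_rate,
                              inflation_rate))

# McFadden's pseudo-R2: ~0.27 for Model 1 vs. ~0.23 for Model 2, per the text
pR2(model1)["McFadden"]
pR2(model2)["McFadden"]

# Out-of-sample predicted probabilities for each model
testProbs1 <- data.frame(Outcome = as.factor(credTest$y_numeric),
                         Probs = predict(model1, credTest, type = "response"))
testProbs2 <- data.frame(Outcome = as.factor(credTest$y_numeric),
                         Probs = predict(model2, credTest, type = "response"))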

# Density of predicted probabilities by observed outcome (Model 1)
ggplot(testProbs1, aes(x = Probs, fill = as.factor(Outcome))) + 
    geom_density() +
    facet_grid(Outcome ~ .) +
    scale_fill_manual(values = palette2) + xlim(0, 1) +
    labs(x = "Takes HRTC", y = "Density of probabilities",
         title = "Distribution of predicted probabilities by observed outcome") +
    plotTheme() + theme(strip.text.x = element_text(size = 18),
                        legend.position = "none")

# Classify as "takes the credit" when predicted probability exceeds 0.3
testProbs1 <- 
  testProbs1 %>%
  mutate(predOutcome = as.factor(ifelse(Probs > 0.3, 1, 0)))

testProbs2 <- 
  testProbs2 %>%
  mutate(predOutcome = as.factor(ifelse(Probs > 0.3, 1, 0)))

3.1 Accuracy Metrics

When the ROC curves of the two models are compared, Model 2 appears to show a smoother curve and has a greater AUC (0.7454872 vs. Model 1’s 0.7308491). However, Model 1 has higher sensitivity, while Model 2 has higher specificity. Looking at the confusion matrices below, we do indeed see a greater number of true positive and true negative predictions with Model 1. Model 1 will be used moving forward, despite its lower AUC.
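
The AUC values cited above can be reproduced with pROC (already loaded), using the test-set probabilities:

# Area under the ROC curve for each model
auc(testProbs1$Outcome, testProbs1$Probs)  # Model 1, ~0.731
auc(testProbs2$Outcome, testProbs2$Probs)  # Model 2, ~0.745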

# Cross-validation: 100 folds, summarizing ROC, sensitivity, and specificity
ctrl <- trainControl(method = "cv", number = 100, classProbs=TRUE, summaryFunction=twoClassSummary)

cvFit1 <- train(y ~ ., data = homecred %>% 
                 dplyr::select(y, age, job, month, contact, poutcome, cons.price.idx, 
                               unemploy_rate, inflation_rate), 
               method="glm", family="binomial",
               metric="ROC", trControl = ctrl)
cvFit2 <- train(y ~ ., data = homecred %>% 
                 dplyr::select(y, age_range, job_level, edu_level, season, contact, poutcome,
                               cons.price.idx, unemploy_rate, inflation_rate), 
               method="glm", family="binomial",
               metric="ROC", trControl = ctrl)

cvFit1
cvFit2

3.2 Model 1 ROC Curve

As shown below, the ROC curve for Model 1 indicates that it is a decent, albeit not especially high-performing, predictor of HRTC utilization.

# Confusion Matrix
caret::confusionMatrix(testProbs1$predOutcome, testProbs1$Outcome, 
                       positive = "1")
# Receiver Operating Characteristic (ROC) Curves
ggplot(testProbs1, aes(d = as.numeric(as.character(Outcome)), m = Probs)) +
    geom_roc(n.cuts = 50, labels = FALSE, colour = "#FE9900") +
    style_roc(theme = theme_grey) +
    geom_abline(slope = 1, intercept = 0, size = 1.5, color = 'grey') +
    labs(title = "ROC Curve - Model 1")

3.3 Generalizability

Model 1 does poorly on generalizability: across the 100 cross-validation folds it shows decent sensitivity but low specificity and ROC.

# Plot cross-validated goodness-of-fit distributions for Model 1
dplyr::select(cvFit1$resample, -Resample) %>%
  gather(metric, value) %>%
  left_join(gather(cvFit1$results[2:4], metric, mean)) %>%
  ggplot(aes(value)) + 
  geom_histogram(bins=35, fill = "#FF006A") +
  facet_wrap(~metric) +
  geom_vline(aes(xintercept = mean), colour = "#981FAC", linetype = 3, size = 1.5) +
  scale_x_continuous(limits = c(0, 1)) +
  labs(x="Goodness of Fit", y="Count", title="CV Goodness of Fit Metrics",
       subtitle = "Across-fold mean reprented as dotted lines") +
  plotTheme()

4 Cost-Benefit Analysis

When considering costs and benefits, revenue is calculated as follows for each possible confusion matrix outcome. Local community efforts to reduce flood risk have been shown to decrease flood insurance premiums by as much as $300 per policy; this is the number I use when calculating the benefit of taking the HRTC. The increase in tax revenue is assumed to be 3%, taken from combining state and local taxes in Philadelphia.

The revenue from a true positive outcome is calculated by weighing the cost of marketing to the homeowner and paying out the HRTC against the increase in property taxes from the $10,000 rise in the home's value and the $56,000 rise in the price of the surrounding homes. Additionally, the average repair cost is $6,600, another benefit to the local economy.

Per outcome, the revenues total as follows (see the arithmetic check after this list):

  1. True negative revenue: $0
  • No marketing cost, no benefit gained
  2. True positive revenue: ((($10,000 + $56,000) * 3%) + $300 + $6,600 + $2,150) * 25% of those who take HRTC after receiving marketing = $2,758
  • Property tax revenue from the total $66K increase in property values in and around a particular house
  • $300 reduction in flood insurance premium
  • $6,600 average spending on labor and materials for home repair work (benefit to the local economy)
  • $2,150 saved from the home price increase due to the HRTC program, compared to a direct cash transfer of $10,000 to the homeowner
  • The marketing ($2,850) and HRTC payout ($5,000) costs are not netted out of this figure; the marketing cost is charged to the false positive category below
  3. False negative revenue: $0 - ((($10,000 + $56,000) * 3%) + $300 + $6,600 + $2,150) * 14% opportunity cost = -$1,544
  • No marketing cost, no benefit
  • Opportunity cost of 14% lower HRTC utilization among those not contacted (25% - 11%)
  4. False positive revenue: $0 - $2,850 = -$2,850
  • Marketing cost, but no HRTC payout
  • No benefits
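
As a quick arithmetic check of the unit revenues used in the code below:

# Check of per-outcome unit revenues (per the itemization above)
benefit <- ((10000 + 56000) * 0.03) + 300 + 6600 + 2150  # $11,030 per credit taken
benefit * 0.25     # true positive:  $2,757.50, rounded to $2,758
benefit * -0.14    # false negative: -$1,544.20, rounded to -$1,544
-2850              # false positive: marketing cost only
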
# Cost-Benefit
cost_benefit_table <-
  testProbs1 %>%
  count(predOutcome, Outcome) %>%
  summarize(True_Negative = sum(n[predOutcome==0 & Outcome==0]),
            True_Positive = sum(n[predOutcome==1 & Outcome==1]),
            False_Negative = sum(n[predOutcome==0 & Outcome==1]),
            False_Positive = sum(n[predOutcome==1 & Outcome==0])) %>%
  gather(Variable, Count) %>%
  mutate(Revenue =
           case_when(Variable == "True_Negative"  ~ (Count * 0),
                     Variable == "True_Positive"  ~ (Count * 2758), 
                     Variable == "False_Negative" ~ (Count * -1544),
                     Variable == "False_Positive" ~ (Count * -2850))) %>%
  bind_cols(data.frame(Description = c(
    "We predicted would not take HRTC and did not contact",
    "We predicted would take HRTC and did after being contacted",
    "We predicted would not take HRTC and did not contact",
    "We predicted would take HRTC and contacted but homeowner did not take")))
knitr::kable(cost_benefit_table, format = "html", output = TRUE) %>%
  kable_styling()
Variable         Count    Revenue  Description
True_Negative    1,726          0  We predicted would not take HRTC and did not contact
True_Positive       78    215,124  We predicted would take HRTC and did after being contacted
False_Negative     128   -197,632  We predicted would not take HRTC, did not contact, but homeowner took the credit
False_Positive      86   -245,100  We predicted would take HRTC and contacted but homeowner did not take

5 Optimizing Threshold

Taking the cost-benefit analysis as laid out above, I searched for the revenue-maximizing threshold. However, because false positives cost more than true positives earn, no threshold produced a net benefit: every campaign lost more money than it returned. The threshold that loses the least is 0.68, losing only $210,136, compared to $221,104 at the default 0.50 threshold. Note that the Count column below sums all four confusion matrix outcomes, so it equals the test set size (2,018) at every threshold; the smaller loss at 0.68 therefore comes from trading expensive false positives for cheaper false negatives, not from any greater sensitivity in the model.

# Optimize
whichThreshold <- 
  iterateThresholds(
    data=testProbs1, observedClass = Outcome, predictedProbs = Probs)

whichThreshold <- 
  whichThreshold %>%
  dplyr::select(starts_with("Count"), Threshold) %>%
  gather(Variable, Count, -Threshold) %>%
  mutate(Revenue =
           case_when(Variable == "Count_TN"  ~ (Count * 0),
                     Variable == "Count_TP"  ~ (Count * 2758),
                     Variable == "Count_FN" ~ (Count * -1544),
                     Variable == "Count_FP" ~ (Count * -2850)))

whichThreshold %>%
  ggplot(.,aes(Threshold, Revenue, colour = Variable)) +
  geom_point() +
  scale_colour_manual(values = palette5[c(5, 1:3)]) +    
  labs(title = "Revenue by confusion matrix metric and threshold",
       y = "Revenue") +
  plotTheme() +
  guides(colour=guide_legend(title = "Confusion Matrix")) 

# Total revenue and count by threshold
# NB: actualHomeCred estimates credits actually taken (25% of true positives,
# 11% of false negatives), but it is not carried into the summarize() below;
# Count sums all four outcomes, so it equals the test set size (2,018) at
# every threshold.
whichThreshold_revenue <- 
  whichThreshold %>% 
  mutate(actualHomeCred = ifelse(Variable == "Count_TP", (Count * .25),
                              ifelse(Variable == "Count_FN", (Count * .11), 0))) %>% 
  group_by(Threshold) %>% 
  summarize(Revenue = sum(Revenue),
            Count = sum(Count)) 
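
To pull the loss-minimizing threshold out of this table directly:

# Threshold with the smallest loss (0.68, as reported below)
whichThreshold_revenue %>% arrange(desc(Revenue)) %>% slice(1)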

whichThreshold_revenue %>%
  dplyr::select(Threshold, Revenue, Count) %>%
  gather(Variable, Value, -Threshold) %>%
  ggplot(aes(Threshold, Value, colour = Variable)) +
                 geom_point() +
                 geom_vline(xintercept = pull(arrange(whichThreshold_revenue, -Revenue)[1,1])) +
                 geom_text(aes(x = (pull(arrange(whichThreshold_revenue, -Revenue)[1,1]) - .025), 
                               label = pull(arrange(whichThreshold_revenue, -Revenue)[1,1]),
                               y = -4000000), color = "black", angle = 90, size = 4) +
                 scale_colour_manual(values = palette2) +
                 plotTheme() +
                 labs(title = "Number of Credits Taken and Revenue by Threshold",
                      subtitle = "Vertical line denotes optimal threshold")

whichThreshold_compare <- 
  filter(whichThreshold_revenue, Threshold %in% c(0.50, 0.68))

5.1 Revenue Loss Comparison by Threshold

knitr::kable(whichThreshold_compare) %>%
  kable_styling()
Threshold    Revenue  Count
     0.50   -221,104  2,018
     0.68   -210,136  2,018

6 Conclusion

This model is not ready for primetime. Unless it incorporates additional relevant variables, it is not sufficiently sensitive to produce more benefit than cost. Additionally, unless the cost of marketing can be reduced (e.g., by adopting a social media strategy rather than a purely phone-based one), false positives will continue to cost more than true positives return, meaning an extremely high degree of accuracy would be needed to produce any net benefit.