Data used in this model is queried from a proprietary database I’ve developed that contains property and rent data for all zip codes across the United States. Median home values are sourced from Zillow’s Monthly Time-Series Data, and median income comes from the annual release of the American Community Survey (U.S. Census).
library(DBI)
library(dplyr)
library(caret)
library(neuralnet)
library(xgboost)
# Helper: open a connection to the MSA database, run a query, and return the full result
sqlQuery <- function(query){
  DB <- dbConnect(RMySQL::MySQL(), dbname = "MSA", username = "root",
                  password = "********", host = "45.79.92.92", port = 3306)
  on.exit(dbDisconnect(DB))
  rs <- dbSendQuery(DB, query)
  result <- fetch(rs, -1)
  return(result)
}
myquery <- "SELECT *
FROM hvi_monthly;"
median_hvi <- sqlQuery(myquery)
## Warning: Closing open result sets
myquery <- "SELECT *
FROM income_census;"
median_income <- sqlQuery(myquery)
## Warning: Closing open result sets
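The “Closing open result sets” warnings appear because the helper disconnects while a result set is still open. A hedged variant (sqlQueryClean is just an illustrative name) that avoids the warning by letting dbGetQuery() fetch and clear the result in one step:
# Sketch: dbGetQuery() fetches the full result and clears it before the
# connection is closed, so no "Closing open result sets" warning is raised.
sqlQueryClean <- function(query){
  DB <- dbConnect(RMySQL::MySQL(), dbname = "MSA", username = "root",
                  password = "********", host = "45.79.92.92", port = 3306)
  on.exit(dbDisconnect(DB))
  dbGetQuery(DB, query)
}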
The home value data is a monthly time series; let’s collapse it to the median of each year’s monthly values for every zip code.
summary_median_hvi <- median_hvi %>%
  mutate(year = substr(date, 1, 4)) %>%
  group_by(year, zip) %>%
  summarise(medianHVI = median(hvi)) %>%
  ungroup()
head(summary_median_hvi)
Merge Annual Median HVI with Median Income
median_income$year <- as.character(median_income$year)
complete <- median_income %>%
  select(zip, year, medianIncome) %>%
  left_join(summary_median_hvi, by = c("zip", "year"))
# Drop rows with missing HVI or non-positive income
complete <- complete %>%
  filter(!is.na(medianHVI)) %>%
  filter(medianIncome > 0)
head(complete)
set.seed(2302)
trainIndex <- createDataPartition(complete$medianIncome, p = .7,
                                  list = FALSE, times = 1)
train <- complete[trainIndex,]
test <- complete[-trainIndex,]
# Home prices are skewed right, so we take the log. Income is also skewed right, but the lin-log model performs slightly better than a log-log specification.
lm_fit <- lm(medianIncome ~ log(medianHVI), train)
summary(lm_fit)
##
## Call:
## lm(formula = medianIncome ~ log(medianHVI), data = train)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
##  -147699    -9151     -472     8107   144697
##
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)    -233432.20    1087.62  -214.6   <2e-16 ***
## log(medianHVI)   24399.10      90.17   270.6   <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16910 on 74653 degrees of freedom
## Multiple R-squared: 0.4952, Adjusted R-squared: 0.4952
## F-statistic: 7.322e+04 on 1 and 74653 DF, p-value: < 2.2e-16
We can see from the R² that median home value accounts for nearly 50% of the variance in medianIncome at the zip code level. Let’s check our error metric (MAE).
pred <- predict(lm_fit, test)
mae_lm <- MAE(test$medianIncome, pred)
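As a quick check on the lin-log vs. log-log claim above, a hedged comparison of held-out MAE (not part of the original analysis; the log-log predictions are back-transformed with exp(), ignoring retransformation bias):
# Sketch: fit a log-log specification and compare test-set MAE with the lin-log fit
lm_loglog <- lm(log(medianIncome) ~ log(medianHVI), data = train)
mae_loglog <- MAE(test$medianIncome, exp(predict(lm_loglog, test)))
c(lin_log = mae_lm, log_log = mae_loglog)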
Data Preprocessing
We first have to do some rescaling to the [0, 1] interval for our NN model.
scale01 <- function(x){
  (x - min(x)) / (max(x) - min(x))
}
complete$year <- as.numeric(complete$year)
complete$medianHVI_log <- log(complete$medianHVI) + 1
completeNN <- complete %>% select(-zip, -year) %>% mutate_all(scale01)
trainNN <- completeNN[trainIndex,]
testNN <- completeNN[-trainIndex,]
# Training the NN is quite computationally expensive; we'll use the first 2,000 observations, which takes less than 5 minutes to train.
trainNN <- trainNN[1:2000,]
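For reading scaled predictions back in dollars later on, a matching inverse helper can be handy. A small sketch (unscale01 is a hypothetical name; the code further down performs the same inversion inline):
# Sketch: invert scale01() given the original (unscaled) vector it was applied to
unscale01 <- function(x_scaled, x_orig){
  x_scaled * (max(x_orig) - min(x_orig)) + min(x_orig)
}
# e.g. unscale01(pred, complete$medianIncome) reproduces the back-transform used below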
Train Model
nn_fit <- neuralnet(medianIncome ~ medianHVI_log, data = trainNN, rep = 10)
Predict
pred <- predict(nn_fit, testNN, rep = 3)
#Inverse scale
pred_normal <- pred * (max(complete$medianIncome) - min(complete$medianIncome)) + min(complete$medianIncome)
testNN$medianIncome <- testNN$medianIncome * (max(complete$medianIncome) - min(complete$medianIncome)) + min(complete$medianIncome)
mae_nn <- MAE(test$medianIncome, pred_normal)
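As an aside, rep = 3 above scores an arbitrary repetition. A hedged sketch that instead picks the repetition with the lowest training error:
# Sketch: nn_fit$result.matrix has one column per repetition; its "error" row
# holds each repetition's final training error.
best_rep <- which.min(nn_fit$result.matrix["error", ])
pred_best <- predict(nn_fit, testNN, rep = best_rep)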
Data Preprocessing
Set up the grid of tuning parameters:
library(xgboost)
xgb_grid <- expand.grid(
  nrounds = 1000,
  eta = c(0.1, 0.05, 0.01),
  max_depth = c(2, 3, 4, 5, 6),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = c(1, 2, 3, 4, 5),
  subsample = 1
)
Use 5-fold cross-validation and let caret find the best hyperparameter values. Warning: this is very computationally expensive!
#xgb_caret <- train(x=train1[,-95], y=label_train, method='xgbTree', trControl= my_control, tuneGrid=xgb_grid)
#xgb_caret$bestTune
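For reference, the commented call above relies on objects (train1, label_train, my_control) that are not defined in this document. A hedged sketch of what an equivalent call might look like against this data set (the trainControl settings and column layout are assumptions):
# Sketch: 5-fold CV over xgb_grid using the single log(medianHVI) predictor;
# my_control is an assumed name, mirroring the commented call above.
my_control <- trainControl(method = "cv", number = 5)
xgb_caret <- train(x = data.frame(log_medianHVI = log(train$medianHVI)),
                   y = train$medianIncome,
                   method = "xgbTree",
                   trControl = my_control,
                   tuneGrid = xgb_grid)
xgb_caret$bestTune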
The results above yield optimal hyperparameter values of max_depth = 6, eta = 0.05, and min_child_weight = 4.
Convert to xgb.DMatrix
# Put our training & testing data into two separate xgb.DMatrix objects
outcome <- train$medianIncome
dtrain <- train %>%
  mutate(log_medianHVI = log(medianHVI)) %>%
  select(log_medianHVI)
dtrain <- xgb.DMatrix(data = as.matrix(dtrain), label = outcome)
dtest <- test %>%
  mutate(log_medianHVI = log(medianHVI)) %>%
  select(log_medianHVI)
dtest <- xgb.DMatrix(data = as.matrix(dtest))
Define default parameter list based on caret cross validation.
default_param <- list(
  objective = "reg:linear",
  booster = "gbtree",
  eta = 0.03,            # default = 0.3
  gamma = 0,
  max_depth = 6,         # default = 6
  min_child_weight = 1,  # default = 1
  subsample = 1,
  colsample_bytree = 1
)
CV & Train
Perform cross-validation to determine the best number of rounds for the default parameters above.
xgbcv <- xgb.cv(params = default_param, data = dtrain, nrounds = 500, nfold = 5,
                showsd = TRUE, stratified = TRUE, print_every_n = 40,
                early_stopping_rounds = 10, maximize = FALSE)
## [1] train-rmse:63100.822656+58.723468 test-rmse:63100.985937+244.175322
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
##
## [41] train-rmse:24532.983984+34.859737 test-rmse:24600.189063+291.862726
## [81] train-rmse:17421.858203+37.428281 test-rmse:17596.692969+209.659391
## [121] train-rmse:16591.424609+36.784114 test-rmse:16852.684375+163.455092
## [161] train-rmse:16462.750000+39.279805 test-rmse:16794.080078+152.598702
## Stopping. Best iteration:
## [173] train-rmse:16440.394922+39.613262 test-rmse:16793.092188+151.594450
Now let’s train the model using the best iteration found by cross validation.
xgb_mod <- xgb.train(data = dtrain, params=default_param, nrounds = 172)
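Rather than hard-coding the round count, the best iteration found by xgb.cv can be read from the returned object. A small hedged sketch:
# Sketch: xgb.cv records the best iteration (173 in the run above), which can
# be passed to xgb.train directly instead of a hard-coded nrounds value.
best_nrounds <- xgbcv$best_iteration
# e.g. xgb.train(data = dtrain, params = default_param, nrounds = best_nrounds)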
Predict
#Predict on the test set
pred <- predict(xgb_mod, dtest)
# No back-transform is needed here: the log was applied to the predictor (medianHVI), so predictions are already in dollars
mae_xg <- MAE(test$medianIncome, pred)
print(c(mae_lm, mae_nn, mae_xg))
## [1] 12129.68 12375.10 11984.98
Success! Our XGBoost model improved upon both our baseline linear model and the more computationally expensive neural network.