Data used in this model is queried from a proprietary database I’ve developed that contains property and rent data for all zip codes across the United States. Median home values are sourced from Zillow’s Monthly Time-Series Data, and median income comes from the annual release of the American Community Survey (U.S. Census).
library(DBI)
library(dplyr)
library(caret)
library(neuralnet)
library(xgboost)
# Helper: open a connection to the MSA database, run a query, and return the full result
sqlQuery <- function(query){
  DB <- dbConnect(RMySQL::MySQL(), dbname = "MSA", username = "root",
                  password = "********", host = "45.79.92.92", port = 3306)
  on.exit(dbDisconnect(DB))
  rs <- dbSendQuery(DB, query)
  result <- fetch(rs, -1)
  return(result)
}
myquery <- "SELECT *
FROM hvi_monthly;"
median_hvi <- sqlQuery(myquery)
## Warning: Closing open result sets
myquery <- "SELECT *
FROM income_census;"
median_income <- sqlQuery(myquery)
## Warning: Closing open result sets
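The “Closing open result sets” warnings appear because the helper disconnects while a result set is still open. A hedged variant (sqlQueryClean is just an illustrative name) that avoids the warning by letting dbGetQuery() fetch and clear the result in one step:
# Sketch: dbGetQuery() fetches the full result and clears it before the
# connection is closed, so no "Closing open result sets" warning is raised.
sqlQueryClean <- function(query){
  DB <- dbConnect(RMySQL::MySQL(), dbname = "MSA", username = "root",
                  password = "********", host = "45.79.92.92", port = 3306)
  on.exit(dbDisconnect(DB))
  dbGetQuery(DB, query)
}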
The home value data is a monthly time series; let’s collapse it to the median of each year’s monthly values for every zip code.
summary_median_hvi <- median_hvi %>%
  mutate(year = substr(date, 1, 4)) %>%
  group_by(year, zip) %>%
  summarise(medianHVI = median(hvi)) %>%
  ungroup()
head(summary_median_hvi)
Merge Annual Median HVI with Median Income
median_income$year <- as.character(median_income$year)
complete <- median_income %>%
  select(zip, year, medianIncome) %>%
  left_join(summary_median_hvi, by = c("zip", "year"))
# Drop rows with missing HVI or non-positive income
complete <- complete %>%
  filter(!is.na(medianHVI)) %>%
  filter(medianIncome > 0)
head(complete)
set.seed(2302)
trainIndex <- createDataPartition(complete$medianIncome, p = .7,
                                  list = FALSE, times = 1)
train <- complete[trainIndex,]
test <- complete[-trainIndex,]
# Home prices are skewed right, so we take the log. Income is also skewed right, but the lin-log model performs slightly better than a log-log specification.
lm_fit <- lm(medianIncome ~ log(medianHVI), train)
summary(lm_fit)
##
## Call:
## lm(formula = medianIncome ~ log(medianHVI), data = train)
##
## Residuals:
##      Min       1Q   Median       3Q      Max
##  -147699    -9151     -472     8107   144697
##
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)
## (Intercept)    -233432.20    1087.62  -214.6   <2e-16 ***
## log(medianHVI)   24399.10      90.17   270.6   <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16910 on 74653 degrees of freedom
## Multiple R-squared: 0.4952, Adjusted R-squared: 0.4952
## F-statistic: 7.322e+04 on 1 and 74653 DF, p-value: < 2.2e-16
We can see from the R² that median home value accounts for nearly 50% of the variance in medianIncome at the zip code level. Let’s check our error metric (MAE).
pred <- predict(lm_fit, test)
mae_lm <- MAE(test$medianIncome, pred)
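As a quick check on the lin-log vs. log-log claim above, a hedged comparison of held-out MAE (not part of the original analysis; the log-log predictions are back-transformed with exp(), ignoring retransformation bias):
# Sketch: fit a log-log specification and compare test-set MAE with the lin-log fit
lm_loglog <- lm(log(medianIncome) ~ log(medianHVI), data = train)
mae_loglog <- MAE(test$medianIncome, exp(predict(lm_loglog, test)))
c(lin_log = mae_lm, log_log = mae_loglog)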
Data Preprocessing
We first have to do some rescaling to the [0, 1] interval for our NN model.
scale01 <- function(x){
  (x - min(x)) / (max(x) - min(x))
}
complete$year <- as.numeric(complete$year)
complete$medianHVI_log <- log(complete$medianHVI) + 1
completeNN <- complete %>% select(-zip, -year) %>% mutate_all(scale01)
trainNN <- completeNN[trainIndex,]
testNN <- completeNN[-trainIndex,]
# Training the NN is quite computationally expensive; we'll use the first 2,000 observations, which takes less than 5 minutes to train.
trainNN <- trainNN[1:2000,]
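For reading scaled predictions back in dollars later on, a matching inverse helper can be handy. A small sketch (unscale01 is a hypothetical name; the code further down performs the same inversion inline):
# Sketch: invert scale01() given the original (unscaled) vector it was applied to
unscale01 <- function(x_scaled, x_orig){
  x_scaled * (max(x_orig) - min(x_orig)) + min(x_orig)
}
# e.g. unscale01(pred, complete$medianIncome) reproduces the back-transform used below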
Train Model
nn_fit <- neuralnet(medianIncome ~ medianHVI_log, data = trainNN, rep = 10)
Predict
pred <- predict(nn_fit, testNN, rep = 3)
#Inverse scale
pred_normal <- pred * (max(complete$medianIncome) - min(complete$medianIncome)) + min(complete$medianIncome)
testNN$medianIncome <- testNN$medianIncome * (max(complete$medianIncome) - min(complete$medianIncome)) + min(complete$medianIncome)
mae_nn <- MAE(test$medianIncome, pred_normal)
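As an aside, rep = 3 above scores an arbitrary repetition. A hedged sketch that instead picks the repetition with the lowest training error:
# Sketch: nn_fit$result.matrix has one column per repetition; its "error" row
# holds each repetition's final training error.
best_rep <- which.min(nn_fit$result.matrix["error", ])
pred_best <- predict(nn_fit, testNN, rep = best_rep)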
Data Preprocessing
Set up the grid of tuning parameters:
library(xgboost)
xgb_grid <- expand.grid(
  nrounds = 1000,
  eta = c(0.1, 0.05, 0.01),
  max_depth = c(2, 3, 4, 5, 6),
  gamma = 0,
  colsample_bytree = 1,
  min_child_weight = c(1, 2, 3, 4, 5),
  subsample = 1
)
Use 5-fold cross-validation and let caret find the best hyperparameter values. Warning: this is very computationally expensive!
#xgb_caret <- train(x=train1[,-95], y=label_train, method='xgbTree', trControl= my_control, tuneGrid=xgb_grid)
#xgb_caret$bestTune
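For reference, the commented call above relies on objects (train1, label_train, my_control) that are not defined in this document. A hedged sketch of what an equivalent call might look like against this data set (the trainControl settings and column layout are assumptions):
# Sketch: 5-fold CV over xgb_grid using the single log(medianHVI) predictor;
# my_control is an assumed name, mirroring the commented call above.
my_control <- trainControl(method = "cv", number = 5)
xgb_caret <- train(x = data.frame(log_medianHVI = log(train$medianHVI)),
                   y = train$medianIncome,
                   method = "xgbTree",
                   trControl = my_control,
                   tuneGrid = xgb_grid)
xgb_caret$bestTune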
The results above yield optimal hyperparameter values of max_depth = 6, eta = 0.05, and min_child_weight = 4.
Convert to xgb.DMatrix
# Put our training & testing data into two separate xgb.DMatrix objects
outcome <- train$medianIncome
dtrain <- train %>%
  mutate(log_medianHVI = log(medianHVI)) %>%
  select(log_medianHVI)
dtrain <- xgb.DMatrix(data = as.matrix(dtrain), label = outcome)
dtest <- test %>%
  mutate(log_medianHVI = log(medianHVI)) %>%
  select(log_medianHVI)
dtest <- xgb.DMatrix(data = as.matrix(dtest))
Define default parameter list based on caret cross validation.
default_param <- list(
  objective = "reg:linear",
  booster = "gbtree",
  eta = 0.03,            # default = 0.3
  gamma = 0,
  max_depth = 6,         # default = 6
  min_child_weight = 1,  # default = 1
  subsample = 1,
  colsample_bytree = 1
)
CV & Train
Perform cross-validation to determine the best number of rounds for the default parameters above.
xgbcv <- xgb.cv(params = default_param, data = dtrain, nrounds = 500, nfold = 5,
                showsd = TRUE, stratified = TRUE, print_every_n = 40,
                early_stopping_rounds = 10, maximize = FALSE)
## [1] train-rmse:63100.822656+58.723468 test-rmse:63100.985937+244.175322
## Multiple eval metrics are present. Will use test_rmse for early stopping.
## Will train until test_rmse hasn't improved in 10 rounds.
##
## [41] train-rmse:24532.983984+34.859737 test-rmse:24600.189063+291.862726
## [81] train-rmse:17421.858203+37.428281 test-rmse:17596.692969+209.659391
## [121] train-rmse:16591.424609+36.784114 test-rmse:16852.684375+163.455092
## [161] train-rmse:16462.750000+39.279805 test-rmse:16794.080078+152.598702
## Stopping. Best iteration:
## [173] train-rmse:16440.394922+39.613262 test-rmse:16793.092188+151.594450
Now let’s train the model using the best iteration found by cross validation.
xgb_mod <- xgb.train(data = dtrain, params=default_param, nrounds = 172)
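Rather than hard-coding the round count, the best iteration found by xgb.cv can be read from the returned object. A small hedged sketch:
# Sketch: xgb.cv records the best iteration (173 in the run above), which can
# be passed to xgb.train directly instead of a hard-coded nrounds value.
best_nrounds <- xgbcv$best_iteration
# e.g. xgb.train(data = dtrain, params = default_param, nrounds = best_nrounds)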
Predict
#Predict on the test set
pred <- predict(xgb_mod, dtest)
# No back-transform is needed here: the log was applied to the predictor (medianHVI), so predictions are already in dollars
mae_xg <- MAE(test$medianIncome, pred)
print(c(mae_lm, mae_nn, mae_xg))
## [1] 12129.68 12375.10 11984.98
Success! Our XGBoost model improved upon both our baseline linear model and the more computationally expensive neural network.