Code Through - Random Forest Package

Introduction

This tutorial introduced the random Forest package and variable importance plot for model variable selection. We will use census data and random Forest to determine which variables have the greatest impact on median home value change. Random Forest uses a decision tree design to classify variables in relation to each other and the selected “y”. By analyzing these relationships, it can determine which variable would generate the greatest chain reaction if removed, thereby determining the greatest impact.

Data

For this tutorial we will use data from the Longitudinal Tract Database (LTDB) Project, a project that has harmonized a set of variables from 1970 to 2010.

URL1 <- "https://github.com/DS4PS/cpp-529-fall-2020/raw/main/LABS/data/rodeo/LTDB-2000.rds"
d1 <- readRDS( gzcon( url( URL1 ) ) )

URL2 <- "https://github.com/DS4PS/cpp-529-fall-2020/raw/main/LABS/data/rodeo/LTDB-2010.rds"
d2 <- readRDS( gzcon( url( URL2 ) ) )

URLmd <- "https://github.com/DS4PS/cpp-529-fall-2020/raw/main/LABS/data/rodeo/LTDB-META-DATA.rds"
md <- readRDS( gzcon( url( URLmd ) ) )

d1 <- select( d1, - year )
d2 <- select( d2, - year )

d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )

Filter Rural Districts

#table( d$urban )
d <- filter( d, urban == "urban" )

Create New Variables

d <- select( d, tractid, 
             mhmval00, mhmval12, 
             hinc00, 
             hu00, vac00, own00, rent00, h30old00,
             empclf00, clf00, unemp00, prof00,  
             dpov00, npov00,
             ag25up00, hs00, col00, 
             pop00.x, nhwht00, nhblk00, hisp00, asian00,
             cbsa, cbsaname )

d <- 
  d %>%
  mutate( # percent white in 2000
          p.white = 100 * nhwht00 / pop00.x,
          # percent black in 2000
          p.black = 100 * nhblk00 / pop00.x,
          # percent hispanic in 2000
          p.hisp = 100 * hisp00 / pop00.x, 
          # percent asian in 2000
          p.asian = 100 * asian00 / pop00.x,
          # percent high school grads by age 25 in 2000 
          p.hs = 100 * (hs00+col00) / ag25up00,
          # percent pop with college degree in 2000
          p.col = 100 * col00 / ag25up00,
          # percent employed in professional fields in 2000
          p.prof = 100 * prof00 / empclf00,
          # percent unemployment  in 2000
          p.unemp = 100 * unemp00 / clf00,
          # percent of housing lots in tract that are vacant in 2000
          p.vacant = 100 * vac00 / hu00,
          # dollar change in median home value 2000 to 2010 
          pov.rate = 100 * npov00 / dpov00 )


# adjust 2000 home values for inflation 
mhv.00 <- d$mhmval00 * 1.28855  
mhv.10 <- d$mhmval12

# change in MHV in dollars
mhv.change <- mhv.10 - mhv.00


# drop low 2000 median home values
# to avoid unrealistic growth rates.
#
# tracts with homes that cost less than
# $1,000 are outliers
mhv.00[ mhv.00 < 1000 ] <- NA

# change in MHV in percent
mhv.growth <- 100 * ( mhv.change / mhv.00 )

d$mhv.00 <- mhv.00
d$mhv.10 <- mhv.10
d$mhv.change <- mhv.change
d$mhv.growth <- mhv.growth

Visualize Median Home Values

hist( mhv.00, breaks=200, xlim=c(0,500000), 
      col="gray20", border="white",
      axes=F, 
      xlab="MHV (median = $138k)",
      ylab="",
      main="Median Home Value in 2000 (2010 US dollars)" )

axis( side=1, at=seq(0,500000,100000), 
      labels=c("$0","$100k","$200k","$300k","$400k","$500k") )

abline( v=median( mhv.00, na.rm=T ), col="orange", lwd=3 )

Descriptives

df <- data.frame( MedianHomeValue2000=mhv.00, 
                  MedianHomeValue2010=mhv.10, 
                  MHV.Change.00.to.10=mhv.change,
                  MHV.Growth.00.to.12=mhv.growth )

stargazer( df, 
           type="html", 
           digits=0, 
           summary.stat = c("min", "p25","median","mean","p75","max") )


Statistic	Min	Pctl(25)	Median	Mean	Pctl(75)	Max

MedianHomeValue2000	1,280	105,661	154,884	187,119	224,337	1,288,551
MedianHomeValue2010	9,999	123,200	193,200	246,570	312,000	1,000,001
MHV.Change.00.to.10	-1,228,651	7,187	36,268	60,047	94,881	1,000,001
MHV.Growth.00.to.12	-97	6	25	34	50	17,494

Visualize Change in MHV 2000-2010

If a home worth $10 million increased in value by $100k over ten years it would not be that surprising. If a home worth $50k increased by $100k over the same period that is a growth of 200% and is notable.

The change in value variable only reports absolute change, but does not provide a sense of whether that is a big or small amount for the census tract.

hist( mhv.change/1000, breaks=500, 
      xlim=c(-100,500), yaxt="n", xaxt="n",
      xlab="Thousand of US Dollars (adjusted to 2010)", cex.lab=1.5,
      ylab="", main="Change in Median Home Value 2000 to 2010",
      col="gray20", border="white" )

axis( side=1, at=seq( from=-100, to=500, by=100 ), 
      labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
        
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500, 
      labels=paste0( "Mean = ", dollar( round(1000*mean.x,0)) ), 
      col="darkorange", cex=1.8, pos=3 )

median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000, 
      labels=paste0( "Median = ", dollar( round(1000*median.x,0)) ), 
      col="dodgerblue", cex=1.8, pos=3 )

Variable Selection

First, we need select which variable from the data set we want to predict. For this tutorial I will be using the change in median home value as y. Then, using random Forest, we can find which remaining variables in the data set have the greatest impact on y (some data cleaning my be required). Finally, a variable importance plot will show the effect.

#install.packages('randomForest')
#install.packages('glmnet')
library(glmnet)
library(randomForest)

Applying random Forest

#creating variable selection decision tree 

#   rf1 = randomForest(d$mhv.change ~. , importance = TRUE, na.rm=TRUE, data = d)

#the above code yeilds NA errors 


#identifying errors
str(d)

## 'data.frame':    59722 obs. of  39 variables:
##  $ tractid   : chr  "fips-01-001-020100" "fips-01-001-020200" "fips-01-001-020300" "fips-01-001-020400" ...
##  $ mhmval00  : num  76600 72900 79900 89800 116594 ...
##  $ mhmval12  : num  121500 130500 118700 133500 174500 ...
##  $ hinc00    : num  36685 30298 46731 46142 58886 ...
##  $ hu00      : num  769 731 1263 1871 2282 ...
##  $ vac00     : num  93 67 61 111 80.4 ...
##  $ own00     : num  518 452 869 1390 1671 ...
##  $ rent00    : num  158 212 333 370 531 ...
##  $ h30old00  : num  225 329 452 979 152 ...
##  $ empclf00  : num  826 722 1414 2114 2906 ...
##  $ clf00     : num  872 802 1456 2191 2955 ...
##  $ unemp00   : num  46 80 42 77 49.3 ...
##  $ prof00    : num  221 154 438 673 1173 ...
##  $ dpov00    : num  1790 1907 3262 4551 6048 ...
##  $ npov00    : num  227 433 250 207 223 ...
##  $ ag25up00  : num  1227 1157 2130 3072 3785 ...
##  $ hs00      : num  635 740 990 1477 1257 ...
##  $ col00     : num  192 170 478 708 1214 ...
##  $ pop00.x   : num  1921 1892 3339 4556 6054 ...
##  $ nhwht00   : num  1723 671 2738 4273 5427 ...
##  $ nhblk00   : num  145 1177 498 118 367 ...
##  $ hisp00    : num  12 16 55 101 95.2 ...
##  $ asian00   : num  8 12 27 40 113 ...
##  $ cbsa      : chr  "33860" "33860" "33860" "33860" ...
##  $ cbsaname  : chr  "Montgomery, AL" "Montgomery, AL" "Montgomery, AL" "Montgomery, AL" ...
##  $ p.white   : num  89.7 35.5 82 93.8 89.6 ...
##  $ p.black   : num  7.55 62.21 14.91 2.59 6.07 ...
##  $ p.hisp    : num  0.625 0.846 1.647 2.217 1.573 ...
##  $ p.asian   : num  0.416 0.634 0.809 0.878 1.867 ...
##  $ p.hs      : num  67.4 78.7 68.9 71.1 65.3 ...
##  $ p.col     : num  15.6 14.7 22.4 23 32.1 ...
##  $ p.prof    : num  26.8 21.3 31 31.8 40.4 ...
##  $ p.unemp   : num  5.28 9.98 2.88 3.51 1.67 ...
##  $ p.vacant  : num  12.09 9.17 4.83 5.93 3.52 ...
##  $ pov.rate  : num  12.68 22.71 7.66 4.55 3.69 ...
##  $ mhv.00    : num  98703 93935 102955 115712 150237 ...
##  $ mhv.10    : num  121500 130500 118700 133500 174500 ...
##  $ mhv.change: num  22797 36565 15745 17788 24263 ...
##  $ mhv.growth: num  23.1 38.9 15.3 15.4 16.2 ...

#random forest will not run if there are fields with missing values or character variables. These need to be corrected.
#mhmval12
        #  p.white
        #  p.black
        #  p.hisp
        #  p.asian 
        #  p.hs
        #  p.col
        #  p.prof
        #  p.unemp
        #  p.vacant
        #  pov.rate
        #  mhv.00
        #  mhv.10
        #  mhv.change
        #  mhv.growth

Data Cleaning

Resolving NA’s in variables

For the purpose of this tutorial, we will simply replace missing values in proportions with 0’s. Median/mean replacement values could be used for values that are not proportions.

Note: other investigations and replacements may be needed depending on the data set.

#replacing NAs with 0s
d$p.white = as.numeric(d$p.white)
d$p.white[is.na(d$p.white) == TRUE] = 0

d$p.black = as.numeric(d$p.black)
d$p.black[is.na(d$p.black) == TRUE] = 0

d$p.hisp = as.numeric(d$p.hisp)
d$p.hisp[is.na(d$p.hisp) == TRUE] = 0

d$p.asian = as.numeric(d$p.asian)
d$p.asian[is.na(d$p.asian) == TRUE] = 0

d$p.hs = as.numeric(d$p.hs)
d$p.hs[is.na(d$p.hs) == TRUE] = 0

d$p.col = as.numeric(d$p.col)
d$p.col[is.na(d$p.col) == TRUE] = 0

d$p.prof = as.numeric(d$p.prof)
d$p.prof[is.na(d$p.prof) == TRUE] = 0

d$p.unemp = as.numeric(d$p.unemp)
d$p.unemp[is.na(d$p.unemp) == TRUE] = 0

d$p.vacant = as.numeric(d$p.vacant)
d$p.vacant[is.na(d$p.vacant) == TRUE] = 0

d$pov.rate = as.numeric(d$pov.rate)
d$pov.rate[is.na(d$pov.rate) == TRUE] = 0

Resolving factor varialbes

There are some variables with more that 53 (maximum for random Forest) factor levels. For this tutorial, we will remove these variables and any others that may have a biased influence on y.

Note: it may be better to simplify and combine like factor levels instead of removing them completely.

#removing column completly to reduce bias 
d <- d[,!names(d) %in% c("mhmval00", "mhmval12", "mhv.growth", "mhv.00", "mhv.10", "tractid", "cbsa", "cbsaname")]

d$mhv.change = as.numeric(d$mhv.change)
d$mhv.change[is.na(d$mhv.change) == TRUE] = 0

#str(d)
#summary(d)

Applying Random Forest with cleaned data

#creating variable selection decision tree 
rf1 = randomForest(d$mhv.change ~. , importance = TRUE, data = d)

#visualizing predictor variables 
varImpPlot(rf1, n.var=min(9, nrow(d$importance)))

Interpreting the results

The first plot (%IncMSE) shows how much the variable impacts y. The second plot (IncNodePurity) measures the impurity index of the split tree calculation. For this tutorial, only the first plot is relevant. This shows that h30old00, p.prof, and p.vacant variables have the greatest impact on change in median home value.

Next Steps

Moving forward, I would recommended:

expand the data set to include 2020 variables
create change variables
re-run random Forest with percent change in median home value as y
better cleaning of data sets

You can learn more about the random Forest package and variable importance plot here: https://cran.r-project.org/web/packages/randomForest/randomForest.pdf