This tutorial introduced the random Forest package and variable importance plot for model variable selection. We will use census data and random Forest to determine which variables have the greatest impact on median home value change. Random Forest uses a decision tree design to classify variables in relation to each other and the selected “y”. By analyzing these relationships, it can determine which variable would generate the greatest chain reaction if removed, thereby determining the greatest impact.
For this tutorial we will use data from the Longitudinal Tract Database (LTDB) Project, a project that has harmonized a set of variables from 1970 to 2010.
URL1 <- "https://github.com/DS4PS/cpp-529-fall-2020/raw/main/LABS/data/rodeo/LTDB-2000.rds"
d1 <- readRDS( gzcon( url( URL1 ) ) )
URL2 <- "https://github.com/DS4PS/cpp-529-fall-2020/raw/main/LABS/data/rodeo/LTDB-2010.rds"
d2 <- readRDS( gzcon( url( URL2 ) ) )
URLmd <- "https://github.com/DS4PS/cpp-529-fall-2020/raw/main/LABS/data/rodeo/LTDB-META-DATA.rds"
md <- readRDS( gzcon( url( URLmd ) ) )
d1 <- select( d1, - year )
d2 <- select( d2, - year )
d <- merge( d1, d2, by="tractid" )
d <- merge( d, md, by="tractid" )#table( d$urban )
d <- filter( d, urban == "urban" )d <- select( d, tractid,
mhmval00, mhmval12,
hinc00,
hu00, vac00, own00, rent00, h30old00,
empclf00, clf00, unemp00, prof00,
dpov00, npov00,
ag25up00, hs00, col00,
pop00.x, nhwht00, nhblk00, hisp00, asian00,
cbsa, cbsaname )
d <-
d %>%
mutate( # percent white in 2000
p.white = 100 * nhwht00 / pop00.x,
# percent black in 2000
p.black = 100 * nhblk00 / pop00.x,
# percent hispanic in 2000
p.hisp = 100 * hisp00 / pop00.x,
# percent asian in 2000
p.asian = 100 * asian00 / pop00.x,
# percent high school grads by age 25 in 2000
p.hs = 100 * (hs00+col00) / ag25up00,
# percent pop with college degree in 2000
p.col = 100 * col00 / ag25up00,
# percent employed in professional fields in 2000
p.prof = 100 * prof00 / empclf00,
# percent unemployment in 2000
p.unemp = 100 * unemp00 / clf00,
# percent of housing lots in tract that are vacant in 2000
p.vacant = 100 * vac00 / hu00,
# dollar change in median home value 2000 to 2010
pov.rate = 100 * npov00 / dpov00 )
# adjust 2000 home values for inflation
mhv.00 <- d$mhmval00 * 1.28855
mhv.10 <- d$mhmval12
# change in MHV in dollars
mhv.change <- mhv.10 - mhv.00
# drop low 2000 median home values
# to avoid unrealistic growth rates.
#
# tracts with homes that cost less than
# $1,000 are outliers
mhv.00[ mhv.00 < 1000 ] <- NA
# change in MHV in percent
mhv.growth <- 100 * ( mhv.change / mhv.00 )
d$mhv.00 <- mhv.00
d$mhv.10 <- mhv.10
d$mhv.change <- mhv.change
d$mhv.growth <- mhv.growth hist( mhv.00, breaks=200, xlim=c(0,500000),
col="gray20", border="white",
axes=F,
xlab="MHV (median = $138k)",
ylab="",
main="Median Home Value in 2000 (2010 US dollars)" )
axis( side=1, at=seq(0,500000,100000),
labels=c("$0","$100k","$200k","$300k","$400k","$500k") )
abline( v=median( mhv.00, na.rm=T ), col="orange", lwd=3 )
df <- data.frame( MedianHomeValue2000=mhv.00,
MedianHomeValue2010=mhv.10,
MHV.Change.00.to.10=mhv.change,
MHV.Growth.00.to.12=mhv.growth )
stargazer( df,
type="html",
digits=0,
summary.stat = c("min", "p25","median","mean","p75","max") )| Statistic | Min | Pctl(25) | Median | Mean | Pctl(75) | Max |
| MedianHomeValue2000 | 1,280 | 105,661 | 154,884 | 187,119 | 224,337 | 1,288,551 |
| MedianHomeValue2010 | 9,999 | 123,200 | 193,200 | 246,570 | 312,000 | 1,000,001 |
| MHV.Change.00.to.10 | -1,228,651 | 7,187 | 36,268 | 60,047 | 94,881 | 1,000,001 |
| MHV.Growth.00.to.12 | -97 | 6 | 25 | 34 | 50 | 17,494 |
If a home worth $10 million increased in value by $100k over ten years it would not be that surprising. If a home worth $50k increased by $100k over the same period that is a growth of 200% and is notable.
The change in value variable only reports absolute change, but does not provide a sense of whether that is a big or small amount for the census tract.
hist( mhv.change/1000, breaks=500,
xlim=c(-100,500), yaxt="n", xaxt="n",
xlab="Thousand of US Dollars (adjusted to 2010)", cex.lab=1.5,
ylab="", main="Change in Median Home Value 2000 to 2010",
col="gray20", border="white" )
axis( side=1, at=seq( from=-100, to=500, by=100 ),
labels=paste0( "$", seq( from=-100, to=500, by=100 ), "k" ) )
mean.x <- mean( mhv.change/1000, na.rm=T )
abline( v=mean.x, col="darkorange", lwd=2, lty=2 )
text( x=200, y=1500,
labels=paste0( "Mean = ", dollar( round(1000*mean.x,0)) ),
col="darkorange", cex=1.8, pos=3 )
median.x <- median( mhv.change/1000, na.rm=T )
abline( v=median.x, col="dodgerblue", lwd=2, lty=2 )
text( x=200, y=2000,
labels=paste0( "Median = ", dollar( round(1000*median.x,0)) ),
col="dodgerblue", cex=1.8, pos=3 )
First, we need select which variable from the data set we want to predict. For this tutorial I will be using the change in median home value as y. Then, using random Forest, we can find which remaining variables in the data set have the greatest impact on y (some data cleaning my be required). Finally, a variable importance plot will show the effect.
#install.packages('randomForest')
#install.packages('glmnet')
library(glmnet)
library(randomForest)#creating variable selection decision tree
# rf1 = randomForest(d$mhv.change ~. , importance = TRUE, na.rm=TRUE, data = d)
#the above code yeilds NA errors
#identifying errors
str(d)## 'data.frame': 59722 obs. of 39 variables:
## $ tractid : chr "fips-01-001-020100" "fips-01-001-020200" "fips-01-001-020300" "fips-01-001-020400" ...
## $ mhmval00 : num 76600 72900 79900 89800 116594 ...
## $ mhmval12 : num 121500 130500 118700 133500 174500 ...
## $ hinc00 : num 36685 30298 46731 46142 58886 ...
## $ hu00 : num 769 731 1263 1871 2282 ...
## $ vac00 : num 93 67 61 111 80.4 ...
## $ own00 : num 518 452 869 1390 1671 ...
## $ rent00 : num 158 212 333 370 531 ...
## $ h30old00 : num 225 329 452 979 152 ...
## $ empclf00 : num 826 722 1414 2114 2906 ...
## $ clf00 : num 872 802 1456 2191 2955 ...
## $ unemp00 : num 46 80 42 77 49.3 ...
## $ prof00 : num 221 154 438 673 1173 ...
## $ dpov00 : num 1790 1907 3262 4551 6048 ...
## $ npov00 : num 227 433 250 207 223 ...
## $ ag25up00 : num 1227 1157 2130 3072 3785 ...
## $ hs00 : num 635 740 990 1477 1257 ...
## $ col00 : num 192 170 478 708 1214 ...
## $ pop00.x : num 1921 1892 3339 4556 6054 ...
## $ nhwht00 : num 1723 671 2738 4273 5427 ...
## $ nhblk00 : num 145 1177 498 118 367 ...
## $ hisp00 : num 12 16 55 101 95.2 ...
## $ asian00 : num 8 12 27 40 113 ...
## $ cbsa : chr "33860" "33860" "33860" "33860" ...
## $ cbsaname : chr "Montgomery, AL" "Montgomery, AL" "Montgomery, AL" "Montgomery, AL" ...
## $ p.white : num 89.7 35.5 82 93.8 89.6 ...
## $ p.black : num 7.55 62.21 14.91 2.59 6.07 ...
## $ p.hisp : num 0.625 0.846 1.647 2.217 1.573 ...
## $ p.asian : num 0.416 0.634 0.809 0.878 1.867 ...
## $ p.hs : num 67.4 78.7 68.9 71.1 65.3 ...
## $ p.col : num 15.6 14.7 22.4 23 32.1 ...
## $ p.prof : num 26.8 21.3 31 31.8 40.4 ...
## $ p.unemp : num 5.28 9.98 2.88 3.51 1.67 ...
## $ p.vacant : num 12.09 9.17 4.83 5.93 3.52 ...
## $ pov.rate : num 12.68 22.71 7.66 4.55 3.69 ...
## $ mhv.00 : num 98703 93935 102955 115712 150237 ...
## $ mhv.10 : num 121500 130500 118700 133500 174500 ...
## $ mhv.change: num 22797 36565 15745 17788 24263 ...
## $ mhv.growth: num 23.1 38.9 15.3 15.4 16.2 ...
#random forest will not run if there are fields with missing values or character variables. These need to be corrected.
#mhmval12
# p.white
# p.black
# p.hisp
# p.asian
# p.hs
# p.col
# p.prof
# p.unemp
# p.vacant
# pov.rate
# mhv.00
# mhv.10
# mhv.change
# mhv.growthFor the purpose of this tutorial, we will simply replace missing values in proportions with 0’s. Median/mean replacement values could be used for values that are not proportions.
Note: other investigations and replacements may be needed depending on the data set.
#replacing NAs with 0s
d$p.white = as.numeric(d$p.white)
d$p.white[is.na(d$p.white) == TRUE] = 0
d$p.black = as.numeric(d$p.black)
d$p.black[is.na(d$p.black) == TRUE] = 0
d$p.hisp = as.numeric(d$p.hisp)
d$p.hisp[is.na(d$p.hisp) == TRUE] = 0
d$p.asian = as.numeric(d$p.asian)
d$p.asian[is.na(d$p.asian) == TRUE] = 0
d$p.hs = as.numeric(d$p.hs)
d$p.hs[is.na(d$p.hs) == TRUE] = 0
d$p.col = as.numeric(d$p.col)
d$p.col[is.na(d$p.col) == TRUE] = 0
d$p.prof = as.numeric(d$p.prof)
d$p.prof[is.na(d$p.prof) == TRUE] = 0
d$p.unemp = as.numeric(d$p.unemp)
d$p.unemp[is.na(d$p.unemp) == TRUE] = 0
d$p.vacant = as.numeric(d$p.vacant)
d$p.vacant[is.na(d$p.vacant) == TRUE] = 0
d$pov.rate = as.numeric(d$pov.rate)
d$pov.rate[is.na(d$pov.rate) == TRUE] = 0There are some variables with more that 53 (maximum for random Forest) factor levels. For this tutorial, we will remove these variables and any others that may have a biased influence on y.
Note: it may be better to simplify and combine like factor levels instead of removing them completely.
#removing column completly to reduce bias
d <- d[,!names(d) %in% c("mhmval00", "mhmval12", "mhv.growth", "mhv.00", "mhv.10", "tractid", "cbsa", "cbsaname")]
d$mhv.change = as.numeric(d$mhv.change)
d$mhv.change[is.na(d$mhv.change) == TRUE] = 0
#str(d)
#summary(d)#creating variable selection decision tree
rf1 = randomForest(d$mhv.change ~. , importance = TRUE, data = d)
#visualizing predictor variables
varImpPlot(rf1, n.var=min(9, nrow(d$importance)))
The first plot (%IncMSE) shows how much the variable impacts y. The second plot (IncNodePurity) measures the impurity index of the split tree calculation. For this tutorial, only the first plot is relevant. This shows that h30old00, p.prof, and p.vacant variables have the greatest impact on change in median home value.
Moving forward, I would recommended:
You can learn more about the random Forest package and variable importance plot here: https://cran.r-project.org/web/packages/randomForest/randomForest.pdf