library(maps)  # map making
library(mapproj)  # projections for map making
library(spgwr)  # spatially-weighted regression
## Loading required package: sp
## NOTE: This package does not constitute approval of GWR
## as a method of spatial analysis; see example(gwr)
library(rpart)  # tree-structured modeling
library(randomForest)  # random forests
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(rpart.plot)  # plot tree-structured model information
library(lattice)  # statistical graphics
library(cvTools)  # cross-validation tools including rmspe 
## Loading required package: robustbase
# read in the housing data
houses <- read.table("C:\\Users\\teeja\\Dropbox\\AppliedDataScience\\IST718_BigDataAnalytics\\Block1_Crawl-master\\houses_data.txt",
  header = FALSE, sep = "", dec = ".", row.names = NULL,
  col.names = c("value", "income", "age", "rooms", "bedrooms",
                "pop", "hh", "latitude", "longitude"))
head(houses)
##    value income age rooms bedrooms  pop   hh latitude longitude
## 1 452600 8.3252  41   880      129  322  126    37.88   -122.23
## 2 358500 8.3014  21  7099     1106 2401 1138    37.86   -122.22
## 3 352100 7.2574  52  1467      190  496  177    37.85   -122.24
## 4 341300 5.6431  52  1274      235  558  219    37.85   -122.25
## 5 342200 3.8462  52  1627      280  565  259    37.85   -122.25
## 6 269700 4.0368  52   919      213  413  193    37.85   -122.25
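# note: the absolute path above ties this script to one machine; a more
# portable sketch (assuming houses_data.txt sits in the working directory)
if (file.exists("houses_data.txt")) {
  houses <- read.table("houses_data.txt", header = FALSE, sep = "",
    dec = ".", row.names = NULL,
    col.names = c("value", "income", "age", "rooms", "bedrooms",
                  "pop", "hh", "latitude", "longitude"))
}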
# computed variables for linear model used by Pace and Barry (1997)   
houses$log_value <- log(houses$value)  
houses$income_squared <- houses$income^2 
houses$income_cubed <- houses$income^3 
houses$log_age <- log(houses$age)
houses$log_pc_rooms <- log(houses$rooms / houses$pop)
houses$log_pc_bedrooms <- log(houses$bedrooms / houses$pop)
houses$log_pop_hh <- log(houses$pop / houses$hh)
houses$log_hh <- log(houses$hh)
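# the log transformations above assume strictly positive values; a quick
# sanity check that no zeros in the raw counts produced -Inf
log.vars <- c("log_value", "log_age", "log_pc_rooms", "log_pc_bedrooms",
              "log_pop_hh", "log_hh")
print(sapply(houses[, log.vars], function(x) sum(!is.finite(x))))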
# structure of the Pace and Barry (1997) model for baseline for comparisons
pace.barry.model <- {log_value ~ income + income_squared + income_cubed +
                       log_age + log_pc_rooms + log_pc_bedrooms +
                       log_pop_hh + log_hh}

# for comparison let's look at a simple model with the original variables
simple.model <- {log_value ~ income + age + rooms + bedrooms + pop + hh}
  
# original variables plus variables that add value for trees 
# that is... variables that are not simple monotonic transformations
# of the original explanatory variables

full.model <- {log_value ~ income + age + rooms + bedrooms + pop + hh +
                 log_pc_rooms + log_pc_bedrooms + log_pop_hh}
  
# define variable for selecting a geographically defined
# subset of the data... San Diego area  
# we use nested ifelse statements to do this

# define the bounding box for selecting the area
# here we are selecting the San Diego region
BB.TOP <- 33
BB.BOTTOM <- 32
BB.RIGHT <- -116.75
BB.LEFT <- -125
houses$select <- ifelse(houses$latitude < BB.TOP,
                   ifelse(houses$longitude < BB.RIGHT,
                     ifelse(houses$latitude > BB.BOTTOM,
                       ifelse(houses$longitude > BB.LEFT, 1, 2), 2), 2), 2)
houses$select <- factor(houses$select, levels = c(1, 2),
                        labels = c("Selected", "Not Selected"))
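# the nested ifelse above is equivalent to a single vectorized test;
# this sketch verifies that the two formulations agree
in.box <- with(houses, latitude < BB.TOP & latitude > BB.BOTTOM &
                 longitude < BB.RIGHT & longitude > BB.LEFT)
stopifnot(all((houses$select == "Selected") == in.box))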

houses.selected <- subset(houses, subset = (select == "Selected"))
houses.notselected <- subset(houses, subset = (select == "Not Selected"))  
# plot block group locations: red in the selected area, blue otherwise
#pdf(file = "fig_spatial_map_selected_region.pdf", width = 8.5, height = 8.5)
pointsize <- 0.5
map("state", region = c("california"), project="albers",par=c(39,45)) 
  points(mapproject(houses.selected$longitude, houses.selected$latitude,
  projection=""),pch=20,cex=pointsize,col="red")
  points(mapproject(houses.notselected$longitude, houses.notselected$latitude,
  projection=""),pch=20,cex=pointsize,col="darkblue")
legend("right", legend = c("Selected Region","Not Selected"), 
  col = c("red","darkblue"), pch = 20)
map.scale()  

#dev.off()
# define training and test sets for the selected houses
set.seed(4444)

partition <- sample(nrow(houses.selected)) # permuted list of row index numbers

# dividing by 3/2 assigns roughly two-thirds of the rows to training
houses.selected$Group <- ifelse(partition < nrow(houses.selected)/(3/2), 1, 2)
houses.selected$Group <- factor(houses.selected$Group, levels = c(1, 2),
                                labels = c("TRAIN", "TEST"))

print(table(houses.selected$Group))  # review the split into training and test
## 
## TRAIN  TEST 
##   803   403
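# the permutation split should put about two-thirds of the rows in TRAIN;
# check the realized proportions
print(round(prop.table(table(houses.selected$Group)), digits = 3))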
print(head(houses.selected))  # review the selected data
##        value income age rooms bedrooms pop  hh latitude longitude
## 14006 441700 4.7396  52  2023      301 649 285    32.76   -117.18
## 14007 408500 5.3920  52  1539      212 535 224    32.75   -117.18
## 14008 459600 8.6030  52  1504      208 518 196    32.75   -117.18
## 14009 500001 8.1548  52  1495      230 459 190    32.75   -117.19
## 14010 411600 6.1309  52  1388      213 513 211    32.75   -117.19
## 14011 500001 5.7914  52  1294      175 434 180    32.76   -117.19
##       log_value income_squared income_cubed  log_age log_pc_rooms
## 14006  12.99839       22.46381     106.4695 3.951244    1.1369041
## 14007  12.92025       29.07366     156.7652 3.951244    1.0566214
## 14008  13.03811       74.01161     636.7219 3.951244    1.0659083
## 14009  13.12237       66.50076     542.3004 3.951244    1.1808313
## 14010  12.92781       37.58793     230.4479 3.951244    0.9953433
## 14011  13.12237       33.54031     194.2454 3.951244    1.0924489
##       log_pc_bedrooms log_pop_hh   log_hh   select Group
## 14006      -0.7683225  0.8229435 5.652489 Selected  TEST
## 14007      -0.9256805  0.8706207 5.411646 Selected TRAIN
## 14008      -0.9124372  0.9718606 5.278115 Selected TRAIN
## 14009      -0.6909709  0.8820261 5.247024 Selected TRAIN
## 14010      -0.8789837  0.8884177 5.351858 Selected TRAIN
## 14011      -0.9082586  0.8800877 5.192957 Selected TRAIN
houses.train <- subset(houses.selected, subset = (Group == "TRAIN"))
houses.test <- subset(houses.selected, subset = (Group == "TEST"))
  
# examine the correlations across the variables before we begin modeling
houses.train.df.vars <- houses.train[, c("log_value", "income", "age",
  "rooms", "bedrooms", "pop", "hh",
  "log_pc_rooms", "log_pc_bedrooms", "log_pop_hh")]
houses.train.cormat <- cor(as.matrix(houses.train.df.vars))
houses.train.cormat.line <- houses.train.cormat["log_value",]
# explanatory variables ordered by correlation with the response variable
ordered.houses.train.cormat <-
  houses.train.cormat[names(sort(houses.train.cormat.line, decreasing = TRUE)),
                      names(sort(houses.train.cormat.line, decreasing = FALSE))]
# code to obtain default colors from ggplot2...
number.of.default.colors <- 2  # two end-points for our scale
end.point.colors <- hcl(h = seq(15, 375 - 360/number.of.default.colors,
  length = number.of.default.colors) %% 360, c = 100, l = 65)
# end.point.colors[1] and [2] used to define the three-point color scale

# pdf(file = "fig_spatial_correlation_heat_map.pdf", width = 11, 
#  height = 8.5)
x <- rep(1:nrow(ordered.houses.train.cormat),
         times = ncol(ordered.houses.train.cormat))
y <- rep(1:ncol(ordered.houses.train.cormat),
         each = nrow(ordered.houses.train.cormat))
# use fixed format 0.XXX in cells of correlation matrix
cortext <- sprintf("%0.3f", as.numeric(ordered.houses.train.cormat))  
text.data.frame <- data.frame(x, y, cortext)
text.data.frame$cortext <- as.character(text.data.frame$cortext)
text.data.frame$cortext <- ifelse((text.data.frame$cortext == "1.000"),
    NA,text.data.frame$cortext)  # define diagonal cells as missing
text.data.frame <- na.omit(text.data.frame)  # diagonal cells have no text

print(levelplot(ordered.houses.train.cormat, cuts = 25, tick.number = 9,
  col.regions = 
    colorRampPalette(c(end.point.colors[1], "white", end.point.colors[2])),
  scales=list(tck=0, x=list(rot=90)),
  xlab = "", 
  ylab = "",
  panel = function(...) {
    panel.levelplot(...)  
    panel.text(text.data.frame$x, text.data.frame$y, 
    labels = text.data.frame$cortext)
    }))

#dev.off()    
# scatter plot matrix (splom) demonstration
houses.train.splom.vars <- houses.train[,c("log_value","income","age","rooms")]
# pdf(file = "fig_spatial_sample_splom.pdf", width = 8.5, 
#  height = 8.5)
pairs(houses.train.splom.vars, cex = 0.65, col = "darkblue")

#dev.off()
# define spatial objects as needed for spatial modeling work 
# explanation of spatial objects may be found in chapter 2 of
# Bivand, R. S., Pebesma, E. J., and Gomez-Rubio, V. (2008) 
# Applied Spatial Data Analysis, New York: Springer.
# this involves adding coordinate objects to data frame objects
# training set coordinates to add
houses.coord <- cbind(houses.train$longitude,houses.train$latitude) 
# define spatial points data frame object
houses.train <- SpatialPointsDataFrame(houses.coord,houses.train,bbox = NULL) 

# test set coordinates to add
houses.coord <- cbind(houses.test$longitude,houses.test$latitude) 
# define spatial points data frame object
houses.test <- SpatialPointsDataFrame(houses.coord,houses.test,bbox = NULL) 

# examine the structure of the spatial points data frame
print(str(houses.train))  
## Formal class 'SpatialPointsDataFrame' [package "sp"] with 5 slots
##   ..@ data       :'data.frame':  803 obs. of  19 variables:
##   .. ..$ value          : num [1:803] 408500 459600 500001 411600 500001 ...
##   .. ..$ income         : num [1:803] 5.39 8.6 8.15 6.13 5.79 ...
##   .. ..$ age            : num [1:803] 52 52 52 52 52 36 23 49 52 21 ...
##   .. ..$ rooms          : num [1:803] 1539 1504 1495 1388 1294 ...
##   .. ..$ bedrooms       : num [1:803] 212 208 230 213 175 534 594 494 262 486 ...
##   .. ..$ pop            : num [1:803] 535 518 459 513 434 ...
##   .. ..$ hh             : num [1:803] 224 196 190 211 180 531 536 419 249 482 ...
##   .. ..$ latitude       : num [1:803] 32.8 32.8 32.8 32.8 32.8 ...
##   .. ..$ longitude      : num [1:803] -117 -117 -117 -117 -117 ...
##   .. ..$ log_value      : num [1:803] 12.9 13 13.1 12.9 13.1 ...
##   .. ..$ income_squared : num [1:803] 29.1 74 66.5 37.6 33.5 ...
##   .. ..$ income_cubed   : num [1:803] 157 637 542 230 194 ...
##   .. ..$ log_age        : num [1:803] 3.95 3.95 3.95 3.95 3.95 ...
##   .. ..$ log_pc_rooms   : num [1:803] 1.057 1.066 1.181 0.995 1.092 ...
##   .. ..$ log_pc_bedrooms: num [1:803] -0.926 -0.912 -0.691 -0.879 -0.908 ...
##   .. ..$ log_pop_hh     : num [1:803] 0.871 0.972 0.882 0.888 0.88 ...
##   .. ..$ log_hh         : num [1:803] 5.41 5.28 5.25 5.35 5.19 ...
##   .. ..$ select         : Factor w/ 2 levels "Selected","Not Selected": 1 1 1 1 1 1 1 1 1 1 ...
##   .. ..$ Group          : Factor w/ 2 levels "TRAIN","TEST": 1 1 1 1 1 1 1 1 1 1 ...
##   ..@ coords.nrs : num(0) 
##   ..@ coords     : num [1:803, 1:2] -117 -117 -117 -117 -117 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:2] "coords.x1" "coords.x2"
##   ..@ bbox       : num [1:2, 1:2] -117.3 32.5 -116.8 33
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : chr [1:2] "coords.x1" "coords.x2"
##   .. .. ..$ : chr [1:2] "min" "max"
##   ..@ proj4string:Formal class 'CRS' [package "sp"] with 1 slot
##   .. .. ..@ projargs: chr NA
## NULL
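# the proj4string slot above is NA, so sp treats the coordinates as plain
# numbers; a sketch of attaching a longitude/latitude CRS (done on a copy
# so that the distance computations below match the original analysis)
houses.train.ll <- houses.train
proj4string(houses.train.ll) <- CRS("+proj=longlat +datum=WGS84")
print(is.projected(houses.train.ll))  # FALSE: geographic coordinates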
# --------------------------------------------
# Linear regression a la Pace and Barry (1997)
# --------------------------------------------
pace.barry.train.fit <- lm(pace.barry.model, data = houses.train)

print(pace.barry.train.fit)
## 
## Call:
## lm(formula = pace.barry.model, data = houses.train)
## 
## Coefficients:
##     (Intercept)           income   income_squared     income_cubed  
##       11.328534         0.109107         0.017346        -0.001131  
##         log_age     log_pc_rooms  log_pc_bedrooms       log_pop_hh  
##        0.105285         0.141457        -0.028180        -0.546219  
##          log_hh  
##        0.038683
print(summary(pace.barry.train.fit))
## 
## Call:
## lm(formula = pace.barry.model, data = houses.train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.41721 -0.16599 -0.02227  0.13766  1.03596 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     11.3285340  0.1714460  66.076  < 2e-16 ***
## income           0.1091066  0.0468096   2.331  0.02001 *  
## income_squared   0.0173457  0.0077151   2.248  0.02483 *  
## income_cubed    -0.0011315  0.0003904  -2.898  0.00386 ** 
## log_age          0.1052854  0.0208900   5.040 5.77e-07 ***
## log_pc_rooms     0.1414570  0.0777626   1.819  0.06927 .  
## log_pc_bedrooms -0.0281798  0.1217779  -0.231  0.81706    
## log_pop_hh      -0.5462186  0.1259003  -4.339 1.62e-05 ***
## log_hh           0.0386832  0.0173577   2.229  0.02612 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.262 on 794 degrees of freedom
## Multiple R-squared:  0.6476, Adjusted R-squared:  0.6441 
## F-statistic: 182.4 on 8 and 794 DF,  p-value: < 2.2e-16
# direct calculation of root-mean-squared prediction error
# on the training data
print(rmspe(houses.train$log_value, predict(pace.barry.train.fit))) 
## [1] 0.2605285
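# rmspe() with no trimming is just the familiar root-mean-square formula;
# computing it by hand should reproduce the value above
print(sqrt(mean((houses.train$log_value - predict(pace.barry.train.fit))^2)))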
# report R-squared on training data
print(cor(houses.train$log_value,predict(pace.barry.train.fit))^2)
## [1] 0.647627
cat("\n\nTraining set proportion of variance accounted",
  " for by linear regression = ",
  sprintf("%1.3f",cor(houses.train$log_value,
  predict(pace.barry.train.fit))^2),sep=" ")
## 
## 
## Training set proportion of variance accounted  for by linear regression =  0.648
# evaluate the model fit to the training set on the test set
print(rmspe(houses.test$log_value, predict(pace.barry.train.fit, 
  newdata = houses.test))) 
## [1] 0.2968635
print(cor(houses.test$log_value,
  predict(pace.barry.train.fit, newdata = houses.test))^2)
## [1] 0.5622463
cat("\n\nTest set proportion of variance accounted",
  " for by linear regression = ",
  sprintf("%1.3f",cor(houses.test$log_value,
  predict(pace.barry.train.fit, newdata = houses.test))^2),sep=" ")  
## 
## 
## Test set proportion of variance accounted  for by linear regression =  0.562
# demonstrate cross-validation within the training set
# specify ten-fold cross-validation within the training set
# K = folds   R = replications of K-fold cross-validation
set.seed(1234)  # for reproducibility
folds <- cvFolds(nrow(houses.train), K = 10, R = 50)  
cv.pace.barry.train.fit <- cvLm(pace.barry.train.fit, cost = rtmspe,
  folds = folds, trim = 0.1)
# root-mean-squared prediction error estimated by cross-validation
print(cv.pace.barry.train.fit)
## 10-fold CV results:
##        CV 
## 0.1822949
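# the mechanics behind cvLm, hand-rolled as a sketch: refit the model on
# each fold's complement and score the held-out predictions (untrimmed,
# so the result will differ somewhat from the trimmed rtmspe above)
K <- 10
fold.id <- sample(rep(1:K, length.out = nrow(houses.train)))
cv.rmse <- numeric(K)
for (k in 1:K) {
  hold <- (fold.id == k)
  fit.k <- lm(pace.barry.model, data = as.data.frame(houses.train)[!hold, ])
  pred.k <- predict(fit.k, newdata = as.data.frame(houses.train)[hold, ])
  cv.rmse[k] <- sqrt(mean((houses.train$log_value[hold] - pred.k)^2))
}
print(mean(cv.rmse))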
# --------------------------------------
# Tree-structured regression (simple)
# --------------------------------------
# try tree-structured regression on the original explanatory variables
# note that one of the advantages of trees is no need for transformations
# of the explanatory variables 
rpart.train.fit <- rpart(simple.model, data = houses.train)
print(summary(rpart.train.fit))  # tree summary statistics and split detail
## Call:
## rpart(formula = simple.model, data = houses.train)
##   n= 803 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.29410366      0 1.0000000 1.0019589 0.05631528
## 2 0.14976880      1 0.7058963 0.7170898 0.04570448
## 3 0.02706349      2 0.5561275 0.5737357 0.03610174
## 4 0.02349265      3 0.5290640 0.5737048 0.03691744
## 5 0.01741974      5 0.4820787 0.5386605 0.03596446
## 6 0.01324194      9 0.4123998 0.5217198 0.03429556
## 7 0.01000000     10 0.3991578 0.5177536 0.03469061
## 
## Variable importance
##   income      age      pop    rooms bedrooms       hh 
##       69        7        7        6        5        5 
## 
## Node number 1: 803 observations,    complexity param=0.2941037
##   mean=12.05081, MSE=0.192623 
##   left son=2 (696 obs) right son=3 (107 obs)
##   Primary splits:
##       income   < 5.32885 to the left,  improve=0.29410370, (0 missing)
##       rooms    < 1024    to the left,  improve=0.06578183, (0 missing)
##       pop      < 1183.5  to the right, improve=0.04183966, (0 missing)
##       bedrooms < 77.5    to the left,  improve=0.01754284, (0 missing)
##       hh       < 83.5    to the left,  improve=0.01586739, (0 missing)
##   Surrogate splits:
##       rooms    < 15925.5 to the left,  agree=0.870, adj=0.028, (0 split)
##       pop      < 7643.5  to the left,  agree=0.869, adj=0.019, (0 split)
##       bedrooms < 2672    to the left,  agree=0.868, adj=0.009, (0 split)
## 
## Node number 2: 696 observations,    complexity param=0.1497688
##   mean=11.95748, MSE=0.1406436 
##   left son=4 (166 obs) right son=5 (530 obs)
##   Primary splits:
##       income   < 2.2561  to the left,  improve=0.23665500, (0 missing)
##       rooms    < 1069    to the left,  improve=0.07205139, (0 missing)
##       pop      < 1181    to the right, improve=0.04230318, (0 missing)
##       hh       < 240.5   to the left,  improve=0.02710874, (0 missing)
##       bedrooms < 255.5   to the left,  improve=0.02186221, (0 missing)
##   Surrogate splits:
##       rooms    < 981.5   to the left,  agree=0.777, adj=0.066, (0 split)
##       age      < 46.5    to the right, agree=0.773, adj=0.048, (0 split)
##       bedrooms < 23      to the left,  agree=0.764, adj=0.012, (0 split)
##       pop      < 67.5    to the left,  agree=0.764, adj=0.012, (0 split)
##       hh       < 28.5    to the left,  agree=0.764, adj=0.012, (0 split)
## 
## Node number 3: 107 observations,    complexity param=0.02706349
##   mean=12.65785, MSE=0.1055836 
##   left son=6 (71 obs) right son=7 (36 obs)
##   Primary splits:
##       income   < 6.81245 to the left,  improve=0.37053310, (0 missing)
##       age      < 33.5    to the left,  improve=0.10686860, (0 missing)
##       pop      < 735.5   to the right, improve=0.08459233, (0 missing)
##       bedrooms < 1287.5  to the right, improve=0.03635599, (0 missing)
##       hh       < 267.5   to the right, improve=0.03145908, (0 missing)
##   Surrogate splits:
##       hh < 200.5   to the right, agree=0.673, adj=0.028, (0 split)
## 
## Node number 4: 166 observations,    complexity param=0.01324194
##   mean=11.63149, MSE=0.1015664 
##   left son=8 (18 obs) right son=9 (148 obs)
##   Primary splits:
##       income   < 1.2967  to the left,  improve=0.12148340, (0 missing)
##       rooms    < 945.5   to the left,  improve=0.08376565, (0 missing)
##       hh       < 396.5   to the left,  improve=0.06708615, (0 missing)
##       bedrooms < 325.5   to the left,  improve=0.05926134, (0 missing)
##       pop      < 824     to the right, improve=0.04361732, (0 missing)
## 
## Node number 5: 530 observations,    complexity param=0.02349265
##   mean=12.05958, MSE=0.109174 
##   left son=10 (370 obs) right son=11 (160 obs)
##   Primary splits:
##       income   < 4.1186  to the left,  improve=0.057377380, (0 missing)
##       age      < 38.5    to the left,  improve=0.052121850, (0 missing)
##       pop      < 783.5   to the right, improve=0.048423740, (0 missing)
##       rooms    < 733.5   to the left,  improve=0.029877300, (0 missing)
##       bedrooms < 195     to the left,  improve=0.008404641, (0 missing)
##   Surrogate splits:
##       rooms    < 6703.5  to the left,  agree=0.713, adj=0.050, (0 split)
##       age      < 16.5    to the right, agree=0.711, adj=0.044, (0 split)
##       pop      < 4120    to the left,  agree=0.708, adj=0.031, (0 split)
##       hh       < 1664    to the left,  agree=0.704, adj=0.019, (0 split)
##       bedrooms < 1335    to the left,  agree=0.702, adj=0.012, (0 split)
## 
## Node number 6: 71 observations
##   mean=12.517, MSE=0.07368271 
## 
## Node number 7: 36 observations
##   mean=12.93562, MSE=0.05221936 
## 
## Node number 8: 18 observations
##   mean=11.31298, MSE=0.1497932 
## 
## Node number 9: 148 observations
##   mean=11.67023, MSE=0.08186173 
## 
## Node number 10: 370 observations,    complexity param=0.01741974
##   mean=12.00754, MSE=0.1067343 
##   left son=20 (183 obs) right son=21 (187 obs)
##   Primary splits:
##       pop    < 1134.5  to the right, improve=0.06616804, (0 missing)
##       age    < 17.5    to the left,  improve=0.04097617, (0 missing)
##       rooms  < 733.5   to the left,  improve=0.03624431, (0 missing)
##       hh     < 236     to the left,  improve=0.02571397, (0 missing)
##       income < 2.78415 to the left,  improve=0.01964311, (0 missing)
##   Surrogate splits:
##       rooms    < 2488    to the right, agree=0.862, adj=0.721, (0 split)
##       hh       < 426     to the right, agree=0.827, adj=0.650, (0 split)
##       bedrooms < 499     to the right, agree=0.808, adj=0.612, (0 split)
##       age      < 19.5    to the left,  agree=0.654, adj=0.301, (0 split)
##       income   < 3.27555 to the right, agree=0.559, adj=0.109, (0 split)
## 
## Node number 11: 160 observations,    complexity param=0.02349265
##   mean=12.17994, MSE=0.09406603 
##   left son=22 (151 obs) right son=23 (9 obs)
##   Primary splits:
##       age    < 38.5    to the left,  improve=0.26228430, (0 missing)
##       pop    < 1301    to the right, improve=0.06466240, (0 missing)
##       income < 5.04175 to the left,  improve=0.01778715, (0 missing)
##       hh     < 281.5   to the right, improve=0.01444560, (0 missing)
##       rooms  < 1706.5  to the right, improve=0.01114705, (0 missing)
## 
## Node number 20: 183 observations
##   mean=11.92259, MSE=0.06587219 
## 
## Node number 21: 187 observations,    complexity param=0.01741974
##   mean=12.09067, MSE=0.1327486 
##   left son=42 (79 obs) right son=43 (108 obs)
##   Primary splits:
##       bedrooms < 334.5   to the left,  improve=0.09235448, (0 missing)
##       hh       < 236     to the left,  improve=0.08679052, (0 missing)
##       rooms    < 733.5   to the left,  improve=0.08668065, (0 missing)
##       age      < 17.5    to the left,  improve=0.07793659, (0 missing)
##       pop      < 361.5   to the left,  improve=0.03213004, (0 missing)
##   Surrogate splits:
##       hh     < 310     to the left,  agree=0.930, adj=0.835, (0 split)
##       rooms  < 1518.5  to the left,  agree=0.866, adj=0.684, (0 split)
##       pop    < 704.5   to the left,  agree=0.759, adj=0.430, (0 split)
##       income < 3.45665 to the right, agree=0.626, adj=0.114, (0 split)
##       age    < 51      to the right, agree=0.583, adj=0.013, (0 split)
## 
## Node number 22: 151 observations
##   mean=12.14159, MSE=0.07145599 
## 
## Node number 23: 9 observations
##   mean=12.82332, MSE=0.03479818 
## 
## Node number 42: 79 observations
##   mean=11.96121, MSE=0.08618303 
## 
## Node number 43: 108 observations,    complexity param=0.01741974
##   mean=12.18537, MSE=0.1455826 
##   left son=86 (9 obs) right son=87 (99 obs)
##   Primary splits:
##       age      < 16.5    to the left,  improve=0.15587770, (0 missing)
##       pop      < 781     to the right, improve=0.14919700, (0 missing)
##       rooms    < 2378.5  to the left,  improve=0.05757607, (0 missing)
##       bedrooms < 351.5   to the right, improve=0.03580898, (0 missing)
##       hh       < 573     to the left,  improve=0.01978496, (0 missing)
##   Surrogate splits:
##       bedrooms < 660.5   to the right, agree=0.935, adj=0.222, (0 split)
## 
## Node number 86: 9 observations
##   mean=11.68575, MSE=0.09491246 
## 
## Node number 87: 99 observations,    complexity param=0.01741974
##   mean=12.23079, MSE=0.1254329 
##   left son=174 (76 obs) right son=175 (23 obs)
##   Primary splits:
##       pop      < 781     to the right, improve=0.27550090, (0 missing)
##       rooms    < 1545    to the right, improve=0.05738988, (0 missing)
##       hh       < 573     to the left,  improve=0.05195104, (0 missing)
##       bedrooms < 612     to the left,  improve=0.04391211, (0 missing)
##       income   < 2.91    to the left,  improve=0.04362614, (0 missing)
##   Surrogate splits:
##       hh       < 340     to the right, agree=0.859, adj=0.391, (0 split)
##       rooms    < 1545    to the right, agree=0.828, adj=0.261, (0 split)
##       bedrooms < 357     to the right, agree=0.788, adj=0.087, (0 split)
## 
## Node number 174: 76 observations
##   mean=12.12853, MSE=0.08623869 
## 
## Node number 175: 23 observations
##   mean=12.56871, MSE=0.1061994 
## 
## n= 803 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##   1) root 803 154.6762000 12.05081  
##     2) income< 5.32885 696  97.8879400 11.95748  
##       4) income< 2.2561 166  16.8600300 11.63149  
##         8) income< 1.2967 18   2.6962780 11.31298 *
##         9) income>=1.2967 148  12.1155400 11.67023 *
##       5) income>=2.2561 530  57.8622300 12.05958  
##        10) income< 4.1186 370  39.4916900 12.00754  
##          20) pop>=1134.5 183  12.0546100 11.92259 *
##          21) pop< 1134.5 187  24.8239900 12.09067  
##            42) bedrooms< 334.5 79   6.8084590 11.96121 *
##            43) bedrooms>=334.5 108  15.7229200 12.18537  
##              86) age< 16.5 9   0.8542122 11.68575 *
##              87) age>=16.5 99  12.4178600 12.23079  
##               174) pop>=781 76   6.5541410 12.12853 *
##               175) pop< 781 23   2.4425850 12.56871 *
##        11) income>=4.1186 160  15.0505600 12.17994  
##          22) age< 38.5 151  10.7898500 12.14159 *
##          23) age>=38.5 9   0.3131836 12.82332 *
##     3) income>=5.32885 107  11.2974500 12.65785  
##       6) income< 6.81245 71   5.2314720 12.51700 *
##       7) income>=6.81245 36   1.8798970 12.93562 *
houses.train$rpart.train.fit.pred <- predict(rpart.train.fit, newdata = houses.train)

# root-mean-squared for trees on training set
print(rmspe(houses.train$log_value, houses.train$rpart.train.fit.pred)) 
## [1] 0.277285
# report R-squared on training data
print(cor(houses.train$log_value,houses.train$rpart.train.fit.pred)^2)
## [1] 0.6008422
cat("\n\nTraining set proportion of variance accounted",
  " for by tree-structured regression = ",
  sprintf("%1.3f",cor(houses.train$log_value,
  houses.train$rpart.train.fit.pred)^2),sep=" ")
## 
## 
## Training set proportion of variance accounted  for by tree-structured regression =  0.601
# root-mean-squared for trees on test set
houses.test$rpart.train.fit.pred <- predict(rpart.train.fit, newdata = houses.test)
print(rmspe(houses.test$log_value, houses.test$rpart.train.fit.pred)) 
## [1] 0.3478524
# report R-squared on test data
print(cor(houses.test$log_value,houses.test$rpart.train.fit.pred)^2)
## [1] 0.4037499
cat("\n\nTest set proportion of variance accounted",
  " for by tree-structured regression = ",
  sprintf("%1.3f",
  cor(houses.test$log_value,houses.test$rpart.train.fit.pred)^2),sep=" ")
## 
## 
## Test set proportion of variance accounted  for by tree-structured regression =  0.404
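# the xerror column of the complexity table can guide pruning; a sketch
# that selects the cp value minimizing cross-validated error
opt.cp <- rpart.train.fit$cptable[
  which.min(rpart.train.fit$cptable[, "xerror"]), "CP"]
rpart.train.fit.pruned <- prune(rpart.train.fit, cp = opt.cp)
print(rpart.train.fit.pruned$cptable)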
# plot the regression tree result from rpart
#pdf(file = "fig_spatial_rpart_model.pdf", width = 8.5, height = 8.5)
prp(rpart.train.fit, main="",
  digits = 3,  # digits to display in terminal nodes
  nn = TRUE,  # display the node numbers
  fallen.leaves = TRUE,  # put the leaves on the bottom of the page
  branch = 0.5,  # change angle of branch lines
  branch.lwd = 2,  # width of branch lines
  faclen = 0,  # do not abbreviate factor levels
  trace = 1,  # print the automatically calculated cex
  shadow.col = 0,  # no shadows under the leaves
  branch.lty = 1,  # draw branches using solid lines
  split.cex = 1.2,  # make the split text larger than the node text
  split.prefix = "is ",  # put "is " before split text
  split.suffix = "?",  # put "?" after split text
  split.box.col = "blue",  # blue split boxes (default is white)
  split.col = "white",  # color of text in split box
  split.border.col = "blue",  # blue border on split boxes
  split.round = .25)  # round the split box corners a tad
## cex 1   xlim c(0, 1)   ylim c(0, 1)

#dev.off()
# --------------------------------------
# Tree-structured regression (full)
# --------------------------------------
# try tree-structured regression on the expanded set of variables 
rpart.train.fit.full <- rpart(full.model, data = houses.train)
print(summary(rpart.train.fit.full))  # tree summary statistics and split detail
## Call:
## rpart(formula = full.model, data = houses.train)
##   n= 803 
## 
##            CP nsplit rel error    xerror       xstd
## 1  0.36957978      0 1.0000000 1.0018854 0.05623900
## 2  0.09253626      1 0.6304202 0.6584266 0.04000090
## 3  0.07954998      2 0.5378840 0.5860452 0.03564882
## 4  0.03441113      3 0.4583340 0.5015991 0.03371739
## 5  0.02944949      4 0.4239228 0.4951772 0.03375549
## 6  0.01801253      5 0.3944734 0.4654073 0.03219833
## 7  0.01597595      6 0.3764608 0.4325397 0.03064742
## 8  0.01159800      8 0.3445089 0.4223959 0.02909689
## 9  0.01052854      9 0.3329109 0.4020443 0.02743957
## 10 0.01000000     10 0.3223824 0.3911207 0.02648231
## 
## Variable importance
##    log_pc_rooms          income      log_pop_hh log_pc_bedrooms 
##              42              26              12              10 
##             age             pop           rooms        bedrooms 
##               4               3               2               1 
##              hh 
##               1 
## 
## Node number 1: 803 observations,    complexity param=0.3695798
##   mean=12.05081, MSE=0.192623 
##   left son=2 (543 obs) right son=3 (260 obs)
##   Primary splits:
##       log_pc_rooms    < 0.776981   to the left,  improve=0.36957980, (0 missing)
##       income          < 5.32885    to the left,  improve=0.29410370, (0 missing)
##       log_pop_hh      < 0.9770098  to the right, improve=0.10676160, (0 missing)
##       log_pc_bedrooms < -1.003109  to the left,  improve=0.08702279, (0 missing)
##       rooms           < 1024       to the left,  improve=0.06578183, (0 missing)
##   Surrogate splits:
##       income          < 5.3914     to the left,  agree=0.745, adj=0.212, (0 split)
##       log_pop_hh      < 0.8957548  to the right, agree=0.724, adj=0.146, (0 split)
##       log_pc_bedrooms < -0.7291228 to the left,  agree=0.712, adj=0.112, (0 split)
##       pop             < 659.5      to the right, agree=0.695, adj=0.058, (0 split)
##       age             < 43.5       to the left,  agree=0.681, adj=0.015, (0 split)
## 
## Node number 2: 543 observations,    complexity param=0.09253626
##   mean=11.86618, MSE=0.1041501 
##   left son=4 (207 obs) right son=5 (336 obs)
##   Primary splits:
##       income       < 2.56845    to the left,  improve=0.25309060, (0 missing)
##       log_pc_rooms < 0.5121832  to the left,  improve=0.25271850, (0 missing)
##       rooms        < 1069       to the left,  improve=0.07670465, (0 missing)
##       log_pop_hh   < 1.15877    to the right, improve=0.05329798, (0 missing)
##       age          < 36.5       to the right, improve=0.04997564, (0 missing)
##   Surrogate splits:
##       log_pc_rooms    < 0.301507   to the left,  agree=0.740, adj=0.319, (0 split)
##       age             < 38.5       to the right, agree=0.681, adj=0.164, (0 split)
##       log_pc_bedrooms < -0.7598571 to the right, agree=0.659, adj=0.106, (0 split)
##       log_pop_hh      < 0.8096456  to the left,  agree=0.657, adj=0.101, (0 split)
##       rooms           < 1289       to the left,  agree=0.656, adj=0.097, (0 split)
## 
## Node number 3: 260 observations,    complexity param=0.07954998
##   mean=12.43639, MSE=0.1575289 
##   left son=6 (196 obs) right son=7 (64 obs)
##   Primary splits:
##       income       < 5.7841     to the left,  improve=0.30042080, (0 missing)
##       log_pc_rooms < 0.9569221  to the left,  improve=0.16638650, (0 missing)
##       log_pop_hh   < 0.6512064  to the left,  improve=0.08148162, (0 missing)
##       bedrooms     < 533.5      to the right, improve=0.04063897, (0 missing)
##       hh           < 275        to the right, improve=0.03908611, (0 missing)
##   Surrogate splits:
##       log_pc_bedrooms < -1.05175   to the right, agree=0.823, adj=0.281, (0 split)
##       log_pop_hh      < 1.118858   to the left,  agree=0.815, adj=0.250, (0 split)
##       log_pc_rooms    < 1.065515   to the left,  agree=0.812, adj=0.234, (0 split)
##       age             < 10.5       to the right, agree=0.769, adj=0.063, (0 split)
##       rooms           < 5822.5     to the left,  agree=0.762, adj=0.031, (0 split)
## 
## Node number 4: 207 observations,    complexity param=0.03441113
##   mean=11.65933, MSE=0.09056042 
##   left son=8 (132 obs) right son=9 (75 obs)
##   Primary splits:
##       log_pop_hh      < 0.9273377  to the right, improve=0.2839316, (0 missing)
##       log_pc_rooms    < 0.2758711  to the left,  improve=0.2568647, (0 missing)
##       log_pc_bedrooms < -1.002417  to the left,  improve=0.2354202, (0 missing)
##       income          < 1.2967     to the left,  improve=0.1453733, (0 missing)
##       rooms           < 1058       to the left,  improve=0.1212385, (0 missing)
##   Surrogate splits:
##       log_pc_bedrooms < -0.8692357 to the left,  agree=0.957, adj=0.880, (0 split)
##       log_pc_rooms    < 0.4376137  to the left,  agree=0.894, adj=0.707, (0 split)
##       pop             < 884.5      to the right, agree=0.671, adj=0.093, (0 split)
##       bedrooms        < 1342       to the left,  agree=0.657, adj=0.053, (0 split)
##       hh              < 1221.5     to the left,  agree=0.657, adj=0.053, (0 split)
## 
## Node number 5: 336 observations,    complexity param=0.02944949
##   mean=11.99361, MSE=0.06992364 
##   left son=10 (315 obs) right son=11 (21 obs)
##   Primary splits:
##       income       < 5.53935    to the left,  improve=0.19388220, (0 missing)
##       log_pc_rooms < 0.6163027  to the left,  improve=0.16945780, (0 missing)
##       age          < 6.5        to the right, improve=0.05711924, (0 missing)
##       rooms        < 733.5      to the left,  improve=0.04398671, (0 missing)
##       log_pop_hh   < 0.8972743  to the right, improve=0.03393714, (0 missing)
##   Surrogate splits:
##       rooms    < 15925.5    to the left,  agree=0.943, adj=0.095, (0 split)
##       pop      < 7643.5     to the left,  agree=0.943, adj=0.095, (0 split)
##       bedrooms < 2672       to the left,  agree=0.940, adj=0.048, (0 split)
## 
## Node number 6: 196 observations,    complexity param=0.01597595
##   mean=12.31208, MSE=0.119794 
##   left son=12 (55 obs) right son=13 (141 obs)
##   Primary splits:
##       age             < 20.5       to the left,  improve=0.10308650, (0 missing)
##       income          < 2.23565    to the left,  improve=0.06284607, (0 missing)
##       log_pc_rooms    < 0.9569221  to the left,  improve=0.04314497, (0 missing)
##       log_pc_bedrooms < -0.3007574 to the right, improve=0.02862553, (0 missing)
##       log_pop_hh      < 0.6512064  to the left,  improve=0.02666760, (0 missing)
##   Surrogate splits:
##       bedrooms        < 916.5      to the right, agree=0.755, adj=0.127, (0 split)
##       hh              < 859        to the right, agree=0.745, adj=0.091, (0 split)
##       pop             < 2103.5     to the right, agree=0.740, adj=0.073, (0 split)
##       log_pc_bedrooms < -0.4614824 to the right, agree=0.740, adj=0.073, (0 split)
##       rooms           < 4986.5     to the right, agree=0.735, adj=0.055, (0 split)
## 
## Node number 7: 64 observations
##   mean=12.81709, MSE=0.08083434 
## 
## Node number 8: 132 observations,    complexity param=0.01052854
##   mean=11.53846, MSE=0.06584593 
##   left son=16 (60 obs) right son=17 (72 obs)
##   Primary splits:
##       income       < 1.82915    to the left,  improve=0.1873652, (0 missing)
##       rooms        < 1282.5     to the left,  improve=0.1694186, (0 missing)
##       log_pc_rooms < -0.1411264 to the left,  improve=0.1398044, (0 missing)
##       hh           < 380.5      to the left,  improve=0.1392920, (0 missing)
##       age          < 47.5       to the right, improve=0.1168654, (0 missing)
##   Surrogate splits:
##       log_pc_rooms < 0.1618422  to the left,  agree=0.712, adj=0.367, (0 split)
##       rooms        < 1244       to the left,  agree=0.674, adj=0.283, (0 split)
##       bedrooms     < 330.5      to the left,  agree=0.636, adj=0.200, (0 split)
##       hh           < 353        to the left,  agree=0.636, adj=0.200, (0 split)
##       log_pop_hh   < 1.249465   to the right, agree=0.629, adj=0.183, (0 split)
## 
## Node number 9: 75 observations
##   mean=11.87206, MSE=0.06309013 
## 
## Node number 10: 315 observations,    complexity param=0.01801253
##   mean=11.96355, MSE=0.05745312 
##   left son=20 (177 obs) right son=21 (138 obs)
##   Primary splits:
##       log_pc_rooms    < 0.6163027  to the left,  improve=0.15394810, (0 missing)
##       log_pop_hh      < 1.157636   to the right, improve=0.07602275, (0 missing)
##       log_pc_bedrooms < -1.190874  to the left,  improve=0.06618601, (0 missing)
##       income          < 4.4952     to the left,  improve=0.05620627, (0 missing)
##       rooms           < 733.5      to the left,  improve=0.04738127, (0 missing)
##   Surrogate splits:
##       log_pop_hh      < 1.107112   to the right, agree=0.759, adj=0.449, (0 split)
##       log_pc_bedrooms < -1.020448  to the left,  agree=0.746, adj=0.420, (0 split)
##       age             < 34.5       to the left,  agree=0.594, adj=0.072, (0 split)
##       income          < 4.29695    to the left,  agree=0.584, adj=0.051, (0 split)
##       rooms           < 3512.5     to the left,  agree=0.581, adj=0.043, (0 split)
## 
## Node number 11: 21 observations
##   mean=12.44456, MSE=0.04007016 
## 
## Node number 12: 55 observations,    complexity param=0.011598
##   mean=12.13415, MSE=0.09697877 
##   left son=24 (15 obs) right son=25 (40 obs)
##   Primary splits:
##       income          < 3.00695    to the left,  improve=0.33633130, (0 missing)
##       log_pop_hh      < 0.5150801  to the left,  improve=0.19544110, (0 missing)
##       log_pc_bedrooms < -0.4318298 to the right, improve=0.10486650, (0 missing)
##       age             < 15.5       to the right, improve=0.08412380, (0 missing)
##       rooms           < 4261       to the left,  improve=0.06709558, (0 missing)
##   Surrogate splits:
##       log_pop_hh      < 0.5470277  to the left,  agree=0.873, adj=0.533, (0 split)
##       bedrooms        < 1497       to the right, agree=0.782, adj=0.200, (0 split)
##       hh              < 1271.5     to the right, agree=0.782, adj=0.200, (0 split)
##       log_pc_rooms    < 0.7841115  to the left,  agree=0.764, adj=0.133, (0 split)
##       log_pc_bedrooms < -0.5222446 to the right, agree=0.764, adj=0.133, (0 split)
## 
## Node number 13: 141 observations,    complexity param=0.01597595
##   mean=12.38149, MSE=0.1115274 
##   left son=26 (115 obs) right son=27 (26 obs)
##   Primary splits:
##       log_pc_rooms    < 0.9391865  to the left,  improve=0.16036310, (0 missing)
##       log_pc_bedrooms < -0.7908979 to the left,  improve=0.09001682, (0 missing)
##       log_pop_hh      < 0.924853   to the right, improve=0.07685060, (0 missing)
##       pop             < 789        to the right, improve=0.04200630, (0 missing)
##       income          < 2.30285    to the left,  improve=0.03826700, (0 missing)
##   Surrogate splits:
##       log_pop_hh      < 0.4281059  to the right, agree=0.837, adj=0.115, (0 split)
##       log_pc_bedrooms < -0.3760315 to the left,  agree=0.830, adj=0.077, (0 split)
##       pop             < 387        to the right, agree=0.823, adj=0.038, (0 split)
##       hh              < 156        to the right, agree=0.823, adj=0.038, (0 split)
## 
## Node number 16: 60 observations
##   mean=11.41679, MSE=0.0717158 
## 
## Node number 17: 72 observations
##   mean=11.63986, MSE=0.0383361 
## 
## Node number 20: 177 observations
##   mean=11.88051, MSE=0.03836444 
## 
## Node number 21: 138 observations
##   mean=12.07006, MSE=0.06174722 
## 
## Node number 24: 15 observations
##   mean=11.83923, MSE=0.1014901 
## 
## Node number 25: 40 observations
##   mean=12.24475, MSE=0.05043865 
## 
## Node number 26: 115 observations
##   mean=12.3179, MSE=0.08632885 
## 
## Node number 27: 26 observations
##   mean=12.66274, MSE=0.1259914 
## 
## n= 803 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 803 154.6762000 12.05081  
##    2) log_pc_rooms< 0.776981 543  56.5535100 11.86618  
##      4) income< 2.56845 207  18.7460100 11.65933  
##        8) log_pop_hh>=0.9273377 132   8.6916620 11.53846  
##         16) income< 1.82915 60   4.3029480 11.41679 *
##         17) income>=1.82915 72   2.7601990 11.63986 *
##        9) log_pop_hh< 0.9273377 75   4.7317600 11.87206 *
##      5) income>=2.56845 336  23.4943400 11.99361  
##       10) income< 5.53935 315  18.0977300 11.96355  
##         20) log_pc_rooms< 0.6163027 177   6.7905050 11.88051 *
##         21) log_pc_rooms>=0.6163027 138   8.5211160 12.07006 *
##       11) income>=5.53935 21   0.8414734 12.44456 *
##    3) log_pc_rooms>=0.776981 260  40.9575100 12.43639  
##      6) income< 5.7841 196  23.4796300 12.31208  
##       12) age< 20.5 55   5.3338320 12.13415  
##         24) income< 3.00695 15   1.5223520 11.83923 *
##         25) income>=3.00695 40   2.0175460 12.24475 *
##       13) age>=20.5 141  15.7253600 12.38149  
##         26) log_pc_rooms< 0.9391865 115   9.9278180 12.31790 *
##         27) log_pc_rooms>=0.9391865 26   3.2757770 12.66274 *
##      7) income>=5.7841 64   5.1733980 12.81709 *
houses.train$rpart.train.fit.full.pred <-
  predict(rpart.train.fit.full, newdata = houses.train)

# root-mean-squared for trees on training set
print(rmspe(houses.train$log_value, houses.train$rpart.train.fit.full.pred)) 
## [1] 0.2491952
# report R-squared on training data
print(cor(houses.train$log_value,houses.train$rpart.train.fit.full.pred)^2)
## [1] 0.6776176
cat("\n\nTraining set proportion of variance accounted",
   " for by tree-structured regression (full model) = ",
  sprintf("%1.3f",cor(houses.train$log_value,
  houses.train$rpart.train.fit.full.pred)^2),sep=" ")
## 
## 
## Training set proportion of variance accounted  for by tree-structured regression (full model) =  0.678
# root-mean-squared for trees on test set
houses.test$rpart.train.fit.full.pred <- predict(rpart.train.fit.full, 
  newdata = houses.test)
print(rmspe(houses.test$log_value, houses.test$rpart.train.fit.full.pred)) 
## [1] 0.2924421
# report R-squared on test data
print(cor(houses.test$log_value,houses.test$rpart.train.fit.full.pred)^2)
## [1] 0.5736942
cat("\n\nTest set proportion of variance accounted",
    " for by tree-structured regression (full model) = ",
  sprintf("%1.3f",cor(houses.test$log_value,
  houses.test$rpart.train.fit.full.pred)^2),sep=" ")
## 
## 
## Test set proportion of variance accounted  for by tree-structured regression (full model) =  0.574
# plot the regression tree result from rpart
#pdf(file = "fig_spatial_rpart_model_full.pdf", width = 8.5, height = 8.5)
prp(rpart.train.fit.full, main="",
  digits = 3,  # digits to display in terminal nodes
  nn = TRUE,  # display the node numbers
  fallen.leaves = TRUE,  # put the leaves on the bottom of the page
  branch = 0.5,  # change angle of branch lines
  branch.lwd = 2,  # width of branch lines
  faclen = 0,  # do not abbreviate factor levels
  trace = 1,  # print the automatically calculated cex
  shadow.col = 0,  # no shadows under the leaves
  branch.lty = 1,  # draw branches using solid lines
  split.cex = 1.2,  # make the split text larger than the node text
  split.prefix = "is ",  # put "is " before split text
  split.suffix = "?",  # put "?" after split text
  split.box.col = "blue",  # blue split boxes (default is white)
  split.col = "white",  # color of text in split box
  split.border.col = "blue",  # blue border on split boxes
  split.round = .25)  # round the split box corners a tad
## cex 0.816   xlim c(0, 1)   ylim c(0, 1)

#dev.off()
# --------------------------------------
# Random forests (simple)
# --------------------------------------
set.seed(9999)  # for reproducibility
rf.train.fit <- randomForest(simple.model, 
  data=houses.train, mtry=3, importance=TRUE, na.action=na.omit) 

# review the random forest solution      
print(rf.train.fit)  
## 
## Call:
##  randomForest(formula = simple.model, data = houses.train, mtry = 3,      importance = TRUE, na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.06991898
##                     % Var explained: 63.7
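# mtry = 3 was fixed by hand; randomForest::tuneRF searches over mtry
# using out-of-bag error (a sketch; doBest = FALSE just reports the grid)
simple.x <- as.data.frame(houses.train)[, c("income", "age", "rooms",
                                            "bedrooms", "pop", "hh")]
print(tuneRF(x = simple.x, y = houses.train$log_value, mtryStart = 2,
  ntreeTry = 100, stepFactor = 2, improve = 0.01,
  trace = FALSE, plot = FALSE, doBest = FALSE))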
# check importance of the individual explanatory variables 
# pdf(file = "fig_spatial_random_forest_simple_importance.pdf", 
# width = 11, height = 8.5)
varImpPlot(rf.train.fit, main = "", pch = 20, col = "darkblue")

#dev.off()

# random forest predictions for the training set
houses.train$rf.train.fit.pred <- predict(rf.train.fit, type = "response",
  newdata = houses.train)

# root-mean-squared for random forest on training set
print(rmspe(houses.train$log_value, houses.train$rf.train.fit.pred)) 
## [1] 0.1146651
# report R-squared on training data
print(cor(houses.train$log_value,houses.train$rf.train.fit.pred)^2)
## [1] 0.9452182
cat("\n\nTraining set proportion of variance accounted",
    "for by random forests (simple model) = ",
  sprintf("%1.3f",
  cor(houses.train$log_value,houses.train$rf.train.fit.pred)^2),sep=" ")
## 
## 
## Training set proportion of variance accounted for by random forests (simple model) =  0.945
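# the training-set fit above is optimistic because every tree has seen
# these cases; predict() with no newdata returns out-of-bag predictions,
# which should track the % Var explained reported by print(rf.train.fit)
rf.oob.pred <- predict(rf.train.fit)
print(cor(houses.train$log_value, rf.oob.pred)^2)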
# random forest predictions for the test set using model from training set
houses.test$rf.train.fit.pred <- predict(rf.train.fit,
  type = "response", newdata = houses.test)

# root-mean-squared for random forest on test set
print(rmspe(houses.test$log_value, houses.test$rf.train.fit.pred)) 
## [1] 0.2942473
# report R-squared on test data
print(cor(houses.test$log_value,houses.test$rf.train.fit.pred)^2)
## [1] 0.5646507
cat("\n\nTest set proportion of variance accounted",
    " for by random forests (simple model) = ",
  sprintf("%1.3f",
  cor(houses.test$log_value,houses.test$rf.train.fit.pred)^2),sep=" ")
## 
## 
## Test set proportion of variance accounted  for by random forests (simple model) =  0.565
# --------------------------------------
# Random forests (full)
# --------------------------------------
set.seed(9999)  # for reproducibility
rf.train.fit.full <- randomForest(full.model, 
  data=houses.train, mtry=3, importance=TRUE, na.action=na.omit) 

# review the random forest solution      
print(rf.train.fit.full)  
## 
## Call:
##  randomForest(formula = full.model, data = houses.train, mtry = 3,      importance = TRUE, na.action = na.omit) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##           Mean of squared residuals: 0.06328609
##                     % Var explained: 67.15
# check importance of the individual explanatory variables 
#pdf(file = "fig_spatial_random_forest_full_importance.pdf", 
#width = 11, height = 8.5)
varImpPlot(rf.train.fit.full, main = "", pch = 20, 
  cex = 1.25, col = "darkblue", lcolor = "black")

#dev.off()

# random forest predictions for the training set
houses.train$rf.train.fit.full.pred <- predict(rf.train.fit.full,
  type = "response", newdata = houses.train)

# root-mean-squared for random forest on training set
print(rmspe(houses.train$log_value, houses.train$rf.train.fit.full.pred)) 
## [1] 0.1092729
# report R-squared on training data
print(cor(houses.train$log_value,houses.train$rf.train.fit.full.pred)^2)
## [1] 0.9486865
cat("\n\nTraining set proportion of variance accounted",
    " for by random forests (full model) = ",
  sprintf("%1.3f",cor(houses.train$log_value,
    houses.train$rf.train.fit.full.pred)^2),sep=" ")
## 
## 
## Training set proportion of variance accounted  for by random forests (full model) =  0.949
# random forest predictions for the test set using model from training set
houses.test$rf.train.fit.full.pred <- predict(rf.train.fit.full,
  type = "response", newdata = houses.test)

# root-mean-squared for random forest on test set
print(rmspe(houses.test$log_value, houses.test$rf.train.fit.full.pred)) 
## [1] 0.263197
# report R-squared on test data
print(cor(houses.test$log_value,houses.test$rf.train.fit.full.pred)^2)
## [1] 0.6511199
cat("\n\nTest set proportion of variance accounted",
    " for by random forests (full model) = ",
  sprintf("%1.3f",cor(houses.test$log_value,
    houses.test$rf.train.fit.full.pred)^2),sep=" ")
## 
## 
## Test set proportion of variance accounted  for by random forests (full model) =  0.651
# --------------------------------------
# Geographically weighted regression
# --------------------------------------    
# bandwidth calculation may take a while
set.bandwidth <- gwr.sel(pace.barry.model,
  data=houses.train, verbose = FALSE, show.error.messages = FALSE) 
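# gwr.sel above chooses a single fixed bandwidth by cross-validation; an
# adaptive alternative (a sketch, not used below) returns the proportion
# of nearest neighbors to weight at each fit point
# set.adapt <- gwr.sel(pace.barry.model, data = houses.train, adapt = TRUE,
#   verbose = FALSE, show.error.messages = FALSE)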

# fit the geographically-weighted regression with bandwidth value set.bandwidth
gwr.train.fit <- gwr(pace.barry.model, bandwidth = set.bandwidth, 
  predictions = TRUE, data=houses.train, fit.points = houses.train)
# extract training set predictions
houses.train$gwr.train.fit.pred <- gwr.train.fit$SDF$pred

# root-mean-squared for gwr on training set
print(rmspe(houses.train$log_value, houses.train$gwr.train.fit.pred))
## [1] 0.2343306
# report R-squared on training data
print(cor(houses.train$log_value,houses.train$gwr.train.fit.pred)^2)
## [1] 0.715599
cat("\n\nTraining set proportion of variance accounted",
  " for by geographically-weighted regression = ",
  sprintf("%1.3f",cor(houses.train$log_value,
  houses.train$gwr.train.fit.pred)^2),sep=" ")
## 
## 
## Training set proportion of variance accounted  for by geographically-weighted regression =  0.716
# refit with the same bandwidth: the model is estimated from the training
# data, with predictions evaluated at the test set locations
gwr.train.fit <- gwr(pace.barry.model, bandwidth = set.bandwidth, 
  predictions = TRUE, data=houses.train, fit.points = houses.test)
# extract test set predictions
houses.test$gwr.train.fit.pred <- gwr.train.fit$SDF$pred

# root-mean-squared for gwr on test set
print(rmspe(houses.test$log_value, houses.test$gwr.train.fit.pred))
## [1] 0.273895
# report R-squared on test data
print(cor(houses.test$log_value,houses.test$gwr.train.fit.pred)^2)
## [1] 0.6265548
cat("\n\nTest set proportion of variance accounted",
  " for by geographically-weighted regression = ",
  sprintf("%1.3f",cor(houses.test$log_value,
  houses.test$gwr.train.fit.pred)^2),sep=" ")
## 
## 
## Test set proportion of variance accounted  for by geographically-weighted regression =  0.627
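# a key attraction of GWR is the local coefficient surface; the fitted
# SDF object carries one column per model term, mappable directly (a
# sketch assuming spgwr's usual term-based column naming, so the local
# income slope at the test fit points sits in column "income")
print(spplot(gwr.train.fit$SDF, "income"))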
# --------------------------------------
# Gather results for a single report
# --------------------------------------     
# model performance summary across all methods
methods <- c("Linear regression Pace and Barry (1997)",
  "Tree-structured regression (simple model)",
  "Tree-structured regression (full model)",
  "Random forests (simple model)",
  "Random forests (full model)",
  "Geographically weighted regression (GWR)")
methods.performance.data.frame <- data.frame(methods)

methods.performance.data.frame$training <- 
  c(round(cor(houses.train$log_value,predict(pace.barry.train.fit))^2
    ,digits=3),
    round(cor(houses.train$log_value,
    houses.train$rpart.train.fit.pred)^2,digits=3),
    round(cor(houses.train$log_value,
    houses.train$rpart.train.fit.full.pred)^2,digits=3),
    round(cor(houses.train$log_value,
    houses.train$rf.train.fit.pred)^2,digits=3),
     round(cor(houses.train$log_value,
     houses.train$rf.train.fit.full.pred)^2,digits=3),
    round(cor(houses.train$log_value,
    houses.train$gwr.train.fit.pred)^2,digits=3))
  
methods.performance.data.frame$test <-
  c(round(cor(houses.test$log_value,
  predict(pace.barry.train.fit, newdata = houses.test))^2,digits=3),
    round(cor(houses.test$log_value,
    houses.test$rpart.train.fit.pred)^2,digits=3),
    round(cor(houses.test$log_value,
    houses.test$rpart.train.fit.full.pred)^2,digits=3),
    round(cor(houses.test$log_value,
    houses.test$rf.train.fit.pred)^2,digits=3),
    round(cor(houses.test$log_value,
    houses.test$rf.train.fit.full.pred)^2,digits=3),
    round(cor(houses.test$log_value,
    houses.test$gwr.train.fit.pred)^2,digits=3))

print(methods.performance.data.frame)
##                                     methods training  test
## 1   Linear regression Pace and Barry (1997)    0.648 0.562
## 2 Tree-structured regression (simple model)    0.601 0.404
## 3   Tree-structured regression (full model)    0.678 0.574
## 4             Random forests (simple model)    0.945 0.565
## 5               Random forests (full model)    0.949 0.651
## 6  Geographically weighted regression (GWR)    0.716 0.627
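# the same stored predictions support a root-mean-squared prediction
# error comparison on the test set (a sketch; lower is better)
methods.performance.data.frame$test.rmspe <-
  c(round(rmspe(houses.test$log_value,
      predict(pace.barry.train.fit, newdata = houses.test)), digits = 3),
    round(rmspe(houses.test$log_value,
      houses.test$rpart.train.fit.pred), digits = 3),
    round(rmspe(houses.test$log_value,
      houses.test$rpart.train.fit.full.pred), digits = 3),
    round(rmspe(houses.test$log_value,
      houses.test$rf.train.fit.pred), digits = 3),
    round(rmspe(houses.test$log_value,
      houses.test$rf.train.fit.full.pred), digits = 3),
    round(rmspe(houses.test$log_value,
      houses.test$gwr.train.fit.pred), digits = 3))
print(methods.performance.data.frame)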