library(maps) # map making
library(mapproj) # projections for map making
library(spgwr) # spatially-weighted regression
## Loading required package: sp
## NOTE: This package does not constitute approval of GWR
## as a method of spatial analysis; see example(gwr)
library(rpart) # tree-structured modeling
library(randomForest) # random forests
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
library(rpart.plot) # plot tree-structured model information
library(lattice) # statistical graphics
library(cvTools) # cross-validation tools including rmspe
## Loading required package: robustbase
# read in the housing data
houses <- read.table("C:\\Users\\teeja\\Dropbox\\AppliedDataScience\\IST718_BigDataAnalytics\\Block1_Crawl-master\\houses_data.txt",
    header = FALSE, sep = "", dec = ".", row.names = NULL,
    col.names = c("value", "income", "age", "rooms", "bedrooms",
        "pop", "hh", "latitude", "longitude"))
head(houses)
## value income age rooms bedrooms pop hh latitude longitude
## 1 452600 8.3252 41 880 129 322 126 37.88 -122.23
## 2 358500 8.3014 21 7099 1106 2401 1138 37.86 -122.22
## 3 352100 7.2574 52 1467 190 496 177 37.85 -122.24
## 4 341300 5.6431 52 1274 235 558 219 37.85 -122.25
## 5 342200 3.8462 52 1627 280 565 259 37.85 -122.25
## 6 269700 4.0368 52 919 213 413 193 37.85 -122.25
# computed variables for linear model used by Pace and Barry (1997)
houses$log_value <- log(houses$value)
houses$income_squared <- houses$income^2
houses$income_cubed <- houses$income^3
houses$log_age <- log(houses$age)
houses$log_pc_rooms <- log(houses$rooms / houses$pop)
houses$log_pc_bedrooms <- log(houses$bedrooms / houses$pop)
houses$log_pop_hh <- log(houses$pop / houses$hh)
houses$log_hh <- log(houses$hh)
# structure of the Pace and Barry (1997) model for baseline for comparisons
pace.barry.model <- {log_value ~ income + income_squared + income_cubed +
    log_age + log_pc_rooms + log_pc_bedrooms + log_pop_hh + log_hh}
# for comparison let's look at a simple model with the original variables
simple.model <- {log_value ~ income + age + rooms + bedrooms + pop + hh}
# original variables plus variables that add value for trees
# that is... variables that are not simple monotonic transformations
# of the original explanatory variables
full.model <- {log_value ~ income + age + rooms + bedrooms + pop + hh +
    log_pc_rooms + log_pc_bedrooms + log_pop_hh}
# define variable for selecting a geographically defined
# subset of the data... San Diego area
# we use nested ifelse statements to do this
# define the bounding box for selecting the area
# here we are selecting the San Diego region
BB.TOP <- 33
BB.BOTTOM <- 32
BB.RIGHT <- -116.75
BB.LEFT <- -125
houses$select <- ifelse((houses$latitude < BB.TOP),
    ifelse((houses$longitude < BB.RIGHT),
        ifelse((houses$latitude > BB.BOTTOM),
            ifelse((houses$longitude > BB.LEFT), 1, 2), 2), 2), 2)
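# as an aside, the same selection can be written as one vectorized
# comparison, which is easier to read and extend; a sketch (in.box is a
# hypothetical helper name, not used elsewhere)
in.box <- with(houses, (latitude < BB.TOP) & (latitude > BB.BOTTOM) &
    (longitude < BB.RIGHT) & (longitude > BB.LEFT))
stopifnot(all(ifelse(in.box, 1, 2) == houses$select)) # matches the nested ifelse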
houses$select <- factor(houses$select, levels = c(1,2), labels = c("Selected","Not Selected"))
houses.selected <- subset(houses, subset = (select == "Selected"))
houses.notselected <- subset(houses, subset = (select == "Not Selected"))
# plot the locations of block groups: red in the selected area, blue otherwise
#pdf(file = "fig_spatial_map_selected_region.pdf", width = 8.5, height = 8.5)
pointsize <- 0.5
map("state", region = c("california"), project="albers",par=c(39,45))
points(mapproject(houses.selected$longitude, houses.selected$latitude,
projection=""),pch=20,cex=pointsize,col="red")
points(mapproject(houses.notselected$longitude, houses.notselected$latitude,
projection=""),pch=20,cex=pointsize,col="darkblue")
legend("right", legend = c("Selected Region","Not Selected"),
col = c("red","darkblue"), pch = 20)
map.scale()

#dev.off()
# define training and test sets for the selected houses
set.seed(4444)
partition <- sample(nrow(houses.selected)) # permuted list of row index numbers
# two-thirds of the selected houses go to training, one-third to test
houses.selected$Group <- ifelse((partition < nrow(houses.selected)/(3/2)), 1, 2)
houses.selected$Group <- factor(houses.selected$Group, levels = c(1,2),
    labels = c("TRAIN","TEST"))
print(table(houses.selected$Group)) # review the split into training and test
##
## TRAIN TEST
## 803 403
print(head(houses.selected)) # review the selected data
## value income age rooms bedrooms pop hh latitude longitude
## 14006 441700 4.7396 52 2023 301 649 285 32.76 -117.18
## 14007 408500 5.3920 52 1539 212 535 224 32.75 -117.18
## 14008 459600 8.6030 52 1504 208 518 196 32.75 -117.18
## 14009 500001 8.1548 52 1495 230 459 190 32.75 -117.19
## 14010 411600 6.1309 52 1388 213 513 211 32.75 -117.19
## 14011 500001 5.7914 52 1294 175 434 180 32.76 -117.19
## log_value income_squared income_cubed log_age log_pc_rooms
## 14006 12.99839 22.46381 106.4695 3.951244 1.1369041
## 14007 12.92025 29.07366 156.7652 3.951244 1.0566214
## 14008 13.03811 74.01161 636.7219 3.951244 1.0659083
## 14009 13.12237 66.50076 542.3004 3.951244 1.1808313
## 14010 12.92781 37.58793 230.4479 3.951244 0.9953433
## 14011 13.12237 33.54031 194.2454 3.951244 1.0924489
## log_pc_bedrooms log_pop_hh log_hh select Group
## 14006 -0.7683225 0.8229435 5.652489 Selected TEST
## 14007 -0.9256805 0.8706207 5.411646 Selected TRAIN
## 14008 -0.9124372 0.9718606 5.278115 Selected TRAIN
## 14009 -0.6909709 0.8820261 5.247024 Selected TRAIN
## 14010 -0.8789837 0.8884177 5.351858 Selected TRAIN
## 14011 -0.9082586 0.8800877 5.192957 Selected TRAIN
houses.train <- subset(houses.selected, subset = (Group == "TRAIN"))
houses.test <- subset(houses.selected, subset = (Group == "TEST"))
# examine the correlations across the variables before we begin modeling
houses.train.df.vars <- houses.train[, c("log_value", "income", "age",
    "rooms", "bedrooms", "pop", "hh", "log_pc_rooms", "log_pc_bedrooms",
    "log_pop_hh")]
houses.train.cormat <- cor(as.matrix(houses.train.df.vars))
houses.train.cormat.line <- houses.train.cormat["log_value",]
# explanatory variables ordered by correlation with the response variable
ordered.houses.train.cormat <-
    houses.train.cormat[names(sort(houses.train.cormat.line, decreasing = TRUE)),
        names(sort(houses.train.cormat.line, decreasing = FALSE))]
# emulate the default hue-based color scale from ggplot2 using hcl()...
number.of.default.colors <- 2 # two end-points for our scale
end.point.colors <- hcl(h = seq(15, 375 - 360/number.of.default.colors,
    length = number.of.default.colors) %% 360, c = 100, l = 65)
# end.point.colors[1] and [2] used to define the three-point color scale
# pdf(file = "fig_spatial_correlation_heat_map.pdf", width = 11,
# height = 8.5)
x <- rep(1:nrow(ordered.houses.train.cormat), times = ncol(ordered.houses.train.cormat))
y <- rep(1:ncol(ordered.houses.train.cormat), each = nrow(ordered.houses.train.cormat))
# use fixed format 0.XXX in cells of correlation matrix
cortext <- sprintf("%0.3f", as.numeric(ordered.houses.train.cormat))
text.data.frame <- data.frame(x, y, cortext)
text.data.frame$cortext <- as.character(text.data.frame$cortext)
text.data.frame$cortext <- ifelse((text.data.frame$cortext == "1.000"),
NA,text.data.frame$cortext) # define diagonal cells as missing
text.data.frame <- na.omit(text.data.frame) # diagonal cells have no text
print(levelplot(ordered.houses.train.cormat, cuts = 25, tick.number = 9,
    col.regions = colorRampPalette(c(end.point.colors[1], "white",
        end.point.colors[2])),
    scales = list(tck = 0, x = list(rot = 90)),
    xlab = "",
    ylab = "",
    panel = function(...) {
        panel.levelplot(...)
        panel.text(text.data.frame$x, text.data.frame$y,
            labels = text.data.frame$cortext)
        }))

#dev.off()
# scatter plot matrix (splom) demonstration
houses.train.splom.vars <- houses.train[,c("log_value","income","age","rooms")]
# pdf(file = "fig_spatial_sample_splom.pdf", width = 8.5,
# height = 8.5)
pairs(houses.train.splom.vars, cex = 0.65, col = "darkblue")

#dev.off()
# define spatial objects as needed for spatial modeling work
# explanation of spatial objects may be found in chapter 2 of
# Bivand, R. S., Pebesma, E. J., and Gomez-Rubio, V. (2008)
# Applied Spatial Data Analysis, New York: Springer.
# this involves adding coordinate objects to data frame objects
# training set coordinates to add
houses.coord <- cbind(houses.train$longitude,houses.train$latitude)
# define spatial points data frame object
houses.train <- SpatialPointsDataFrame(houses.coord,houses.train,bbox = NULL)
# test set coordinates to add
houses.coord <- cbind(houses.test$longitude,houses.test$latitude)
# define spatial points data frame object
houses.test <- SpatialPointsDataFrame(houses.coord,houses.test,bbox = NULL)
# examine the structure of the spatial points data frame
print(str(houses.train))
## Formal class 'SpatialPointsDataFrame' [package "sp"] with 5 slots
## ..@ data :'data.frame': 803 obs. of 19 variables:
## .. ..$ value : num [1:803] 408500 459600 500001 411600 500001 ...
## .. ..$ income : num [1:803] 5.39 8.6 8.15 6.13 5.79 ...
## .. ..$ age : num [1:803] 52 52 52 52 52 36 23 49 52 21 ...
## .. ..$ rooms : num [1:803] 1539 1504 1495 1388 1294 ...
## .. ..$ bedrooms : num [1:803] 212 208 230 213 175 534 594 494 262 486 ...
## .. ..$ pop : num [1:803] 535 518 459 513 434 ...
## .. ..$ hh : num [1:803] 224 196 190 211 180 531 536 419 249 482 ...
## .. ..$ latitude : num [1:803] 32.8 32.8 32.8 32.8 32.8 ...
## .. ..$ longitude : num [1:803] -117 -117 -117 -117 -117 ...
## .. ..$ log_value : num [1:803] 12.9 13 13.1 12.9 13.1 ...
## .. ..$ income_squared : num [1:803] 29.1 74 66.5 37.6 33.5 ...
## .. ..$ income_cubed : num [1:803] 157 637 542 230 194 ...
## .. ..$ log_age : num [1:803] 3.95 3.95 3.95 3.95 3.95 ...
## .. ..$ log_pc_rooms : num [1:803] 1.057 1.066 1.181 0.995 1.092 ...
## .. ..$ log_pc_bedrooms: num [1:803] -0.926 -0.912 -0.691 -0.879 -0.908 ...
## .. ..$ log_pop_hh : num [1:803] 0.871 0.972 0.882 0.888 0.88 ...
## .. ..$ log_hh : num [1:803] 5.41 5.28 5.25 5.35 5.19 ...
## .. ..$ select : Factor w/ 2 levels "Selected","Not Selected": 1 1 1 1 1 1 1 1 1 1 ...
## .. ..$ Group : Factor w/ 2 levels "TRAIN","TEST": 1 1 1 1 1 1 1 1 1 1 ...
## ..@ coords.nrs : num(0)
## ..@ coords : num [1:803, 1:2] -117 -117 -117 -117 -117 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : NULL
## .. .. ..$ : chr [1:2] "coords.x1" "coords.x2"
## ..@ bbox : num [1:2, 1:2] -117.3 32.5 -116.8 33
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:2] "coords.x1" "coords.x2"
## .. .. ..$ : chr [1:2] "min" "max"
## ..@ proj4string:Formal class 'CRS' [package "sp"] with 1 slot
## .. .. ..@ projargs: chr NA
## NULL
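# the str() output above shows projargs: chr NA, so no coordinate
# reference system is attached; a sketch of how one would be declared,
# done here on a throwaway copy (assuming plain longitude/latitude on
# WGS84) -- the working objects are left untouched because spgwr can
# switch to great-circle distances once a longlat CRS is present,
# which would change the GWR results reported below
houses.train.wgs84 <- houses.train
proj4string(houses.train.wgs84) <- CRS("+proj=longlat +datum=WGS84")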
# --------------------------------------------
# Linear regression a la Pace and Barry (1997)
# --------------------------------------------
pace.barry.train.fit <- lm(pace.barry.model, data = houses.train)
print(pace.barry.train.fit)
##
## Call:
## lm(formula = pace.barry.model, data = houses.train)
##
## Coefficients:
## (Intercept) income income_squared income_cubed
## 11.328534 0.109107 0.017346 -0.001131
## log_age log_pc_rooms log_pc_bedrooms log_pop_hh
## 0.105285 0.141457 -0.028180 -0.546219
## log_hh
## 0.038683
print(summary(pace.barry.train.fit))
##
## Call:
## lm(formula = pace.barry.model, data = houses.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.41721 -0.16599 -0.02227 0.13766 1.03596
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.3285340 0.1714460 66.076 < 2e-16 ***
## income 0.1091066 0.0468096 2.331 0.02001 *
## income_squared 0.0173457 0.0077151 2.248 0.02483 *
## income_cubed -0.0011315 0.0003904 -2.898 0.00386 **
## log_age 0.1052854 0.0208900 5.040 5.77e-07 ***
## log_pc_rooms 0.1414570 0.0777626 1.819 0.06927 .
## log_pc_bedrooms -0.0281798 0.1217779 -0.231 0.81706
## log_pop_hh -0.5462186 0.1259003 -4.339 1.62e-05 ***
## log_hh 0.0386832 0.0173577 2.229 0.02612 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.262 on 794 degrees of freedom
## Multiple R-squared: 0.6476, Adjusted R-squared: 0.6441
## F-statistic: 182.4 on 8 and 794 DF, p-value: < 2.2e-16
# direct calculation of root-mean-squared prediction error
# on the training data
print(rmspe(houses.train$log_value, predict(pace.barry.train.fit)))
## [1] 0.2605285
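# rmspe from cvTools is simply the square root of the mean squared
# prediction error; a quick check of that identity gives the same value
print(sqrt(mean((houses.train$log_value - predict(pace.barry.train.fit))^2)))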
# report R-squared on training data
print(cor(houses.train$log_value,predict(pace.barry.train.fit))^2)
## [1] 0.647627
cat("\n\nTraining set proportion of variance accounted",
" for by linear regression = ",
sprintf("%1.3f",cor(houses.train$log_value,
predict(pace.barry.train.fit))^2),sep=" ")
##
##
## Training set proportion of variance accounted for by linear regression = 0.648
# test model fit to training set on the test set
print(rmspe(houses.test$log_value, predict(pace.barry.train.fit,
newdata = houses.test)))
## [1] 0.2968635
print(cor(houses.test$log_value,
predict(pace.barry.train.fit, newdata = houses.test))^2)
## [1] 0.5622463
cat("\n\nTest set proportion of variance accounted",
" for by linear regression = ",
sprintf("%1.3f",cor(houses.test$log_value,
predict(pace.barry.train.fit, newdata = houses.test))^2),sep=" ")
##
##
## Test set proportion of variance accounted for by linear regression = 0.562
# demonstrate ten-fold cross-validation within the training set
# K = number of folds, R = replications of K-fold cross-validation
set.seed(1234) # for reproducibility
folds <- cvFolds(nrow(houses.train), K = 10, R = 50)
cv.pace.barry.train.fit <- cvLm(pace.barry.train.fit, cost = rtmspe,
    folds = folds, trim = 0.1)
# trimmed root-mean-squared prediction error estimated by cross-validation
# (rtmspe with trim = 0.1 drops the largest 10 percent of squared errors)
print(cv.pace.barry.train.fit)
## 10-fold CV results:
## CV
## 0.1822949
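# for comparison with the raw RMSPE values above, the same folds can be
# scored with the untrimmed rmspe cost function; a sketch (cv.untrimmed
# is a hypothetical name), whose estimate typically lands between the
# training and test RMSPE
cv.untrimmed <- cvLm(pace.barry.train.fit, cost = rmspe, folds = folds)
print(cv.untrimmed)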
# --------------------------------------
# Tree-structured regression (simple)
# --------------------------------------
# try tree-structured regression on the original explanatory variables
# note that one of the advantages of trees is no need for transformations
# of the explanatory variables
rpart.train.fit <- rpart(simple.model, data = houses.train)
print(summary(rpart.train.fit)) # tree summary statistics and split detail
## Call:
## rpart(formula = simple.model, data = houses.train)
## n= 803
##
## CP nsplit rel error xerror xstd
## 1 0.29410366 0 1.0000000 1.0019589 0.05631528
## 2 0.14976880 1 0.7058963 0.7170898 0.04570448
## 3 0.02706349 2 0.5561275 0.5737357 0.03610174
## 4 0.02349265 3 0.5290640 0.5737048 0.03691744
## 5 0.01741974 5 0.4820787 0.5386605 0.03596446
## 6 0.01324194 9 0.4123998 0.5217198 0.03429556
## 7 0.01000000 10 0.3991578 0.5177536 0.03469061
##
## Variable importance
## income age pop rooms bedrooms hh
## 69 7 7 6 5 5
##
## Node number 1: 803 observations, complexity param=0.2941037
## mean=12.05081, MSE=0.192623
## left son=2 (696 obs) right son=3 (107 obs)
## Primary splits:
## income < 5.32885 to the left, improve=0.29410370, (0 missing)
## rooms < 1024 to the left, improve=0.06578183, (0 missing)
## pop < 1183.5 to the right, improve=0.04183966, (0 missing)
## bedrooms < 77.5 to the left, improve=0.01754284, (0 missing)
## hh < 83.5 to the left, improve=0.01586739, (0 missing)
## Surrogate splits:
## rooms < 15925.5 to the left, agree=0.870, adj=0.028, (0 split)
## pop < 7643.5 to the left, agree=0.869, adj=0.019, (0 split)
## bedrooms < 2672 to the left, agree=0.868, adj=0.009, (0 split)
##
## Node number 2: 696 observations, complexity param=0.1497688
## mean=11.95748, MSE=0.1406436
## left son=4 (166 obs) right son=5 (530 obs)
## Primary splits:
## income < 2.2561 to the left, improve=0.23665500, (0 missing)
## rooms < 1069 to the left, improve=0.07205139, (0 missing)
## pop < 1181 to the right, improve=0.04230318, (0 missing)
## hh < 240.5 to the left, improve=0.02710874, (0 missing)
## bedrooms < 255.5 to the left, improve=0.02186221, (0 missing)
## Surrogate splits:
## rooms < 981.5 to the left, agree=0.777, adj=0.066, (0 split)
## age < 46.5 to the right, agree=0.773, adj=0.048, (0 split)
## bedrooms < 23 to the left, agree=0.764, adj=0.012, (0 split)
## pop < 67.5 to the left, agree=0.764, adj=0.012, (0 split)
## hh < 28.5 to the left, agree=0.764, adj=0.012, (0 split)
##
## Node number 3: 107 observations, complexity param=0.02706349
## mean=12.65785, MSE=0.1055836
## left son=6 (71 obs) right son=7 (36 obs)
## Primary splits:
## income < 6.81245 to the left, improve=0.37053310, (0 missing)
## age < 33.5 to the left, improve=0.10686860, (0 missing)
## pop < 735.5 to the right, improve=0.08459233, (0 missing)
## bedrooms < 1287.5 to the right, improve=0.03635599, (0 missing)
## hh < 267.5 to the right, improve=0.03145908, (0 missing)
## Surrogate splits:
## hh < 200.5 to the right, agree=0.673, adj=0.028, (0 split)
##
## Node number 4: 166 observations, complexity param=0.01324194
## mean=11.63149, MSE=0.1015664
## left son=8 (18 obs) right son=9 (148 obs)
## Primary splits:
## income < 1.2967 to the left, improve=0.12148340, (0 missing)
## rooms < 945.5 to the left, improve=0.08376565, (0 missing)
## hh < 396.5 to the left, improve=0.06708615, (0 missing)
## bedrooms < 325.5 to the left, improve=0.05926134, (0 missing)
## pop < 824 to the right, improve=0.04361732, (0 missing)
##
## Node number 5: 530 observations, complexity param=0.02349265
## mean=12.05958, MSE=0.109174
## left son=10 (370 obs) right son=11 (160 obs)
## Primary splits:
## income < 4.1186 to the left, improve=0.057377380, (0 missing)
## age < 38.5 to the left, improve=0.052121850, (0 missing)
## pop < 783.5 to the right, improve=0.048423740, (0 missing)
## rooms < 733.5 to the left, improve=0.029877300, (0 missing)
## bedrooms < 195 to the left, improve=0.008404641, (0 missing)
## Surrogate splits:
## rooms < 6703.5 to the left, agree=0.713, adj=0.050, (0 split)
## age < 16.5 to the right, agree=0.711, adj=0.044, (0 split)
## pop < 4120 to the left, agree=0.708, adj=0.031, (0 split)
## hh < 1664 to the left, agree=0.704, adj=0.019, (0 split)
## bedrooms < 1335 to the left, agree=0.702, adj=0.012, (0 split)
##
## Node number 6: 71 observations
## mean=12.517, MSE=0.07368271
##
## Node number 7: 36 observations
## mean=12.93562, MSE=0.05221936
##
## Node number 8: 18 observations
## mean=11.31298, MSE=0.1497932
##
## Node number 9: 148 observations
## mean=11.67023, MSE=0.08186173
##
## Node number 10: 370 observations, complexity param=0.01741974
## mean=12.00754, MSE=0.1067343
## left son=20 (183 obs) right son=21 (187 obs)
## Primary splits:
## pop < 1134.5 to the right, improve=0.06616804, (0 missing)
## age < 17.5 to the left, improve=0.04097617, (0 missing)
## rooms < 733.5 to the left, improve=0.03624431, (0 missing)
## hh < 236 to the left, improve=0.02571397, (0 missing)
## income < 2.78415 to the left, improve=0.01964311, (0 missing)
## Surrogate splits:
## rooms < 2488 to the right, agree=0.862, adj=0.721, (0 split)
## hh < 426 to the right, agree=0.827, adj=0.650, (0 split)
## bedrooms < 499 to the right, agree=0.808, adj=0.612, (0 split)
## age < 19.5 to the left, agree=0.654, adj=0.301, (0 split)
## income < 3.27555 to the right, agree=0.559, adj=0.109, (0 split)
##
## Node number 11: 160 observations, complexity param=0.02349265
## mean=12.17994, MSE=0.09406603
## left son=22 (151 obs) right son=23 (9 obs)
## Primary splits:
## age < 38.5 to the left, improve=0.26228430, (0 missing)
## pop < 1301 to the right, improve=0.06466240, (0 missing)
## income < 5.04175 to the left, improve=0.01778715, (0 missing)
## hh < 281.5 to the right, improve=0.01444560, (0 missing)
## rooms < 1706.5 to the right, improve=0.01114705, (0 missing)
##
## Node number 20: 183 observations
## mean=11.92259, MSE=0.06587219
##
## Node number 21: 187 observations, complexity param=0.01741974
## mean=12.09067, MSE=0.1327486
## left son=42 (79 obs) right son=43 (108 obs)
## Primary splits:
## bedrooms < 334.5 to the left, improve=0.09235448, (0 missing)
## hh < 236 to the left, improve=0.08679052, (0 missing)
## rooms < 733.5 to the left, improve=0.08668065, (0 missing)
## age < 17.5 to the left, improve=0.07793659, (0 missing)
## pop < 361.5 to the left, improve=0.03213004, (0 missing)
## Surrogate splits:
## hh < 310 to the left, agree=0.930, adj=0.835, (0 split)
## rooms < 1518.5 to the left, agree=0.866, adj=0.684, (0 split)
## pop < 704.5 to the left, agree=0.759, adj=0.430, (0 split)
## income < 3.45665 to the right, agree=0.626, adj=0.114, (0 split)
## age < 51 to the right, agree=0.583, adj=0.013, (0 split)
##
## Node number 22: 151 observations
## mean=12.14159, MSE=0.07145599
##
## Node number 23: 9 observations
## mean=12.82332, MSE=0.03479818
##
## Node number 42: 79 observations
## mean=11.96121, MSE=0.08618303
##
## Node number 43: 108 observations, complexity param=0.01741974
## mean=12.18537, MSE=0.1455826
## left son=86 (9 obs) right son=87 (99 obs)
## Primary splits:
## age < 16.5 to the left, improve=0.15587770, (0 missing)
## pop < 781 to the right, improve=0.14919700, (0 missing)
## rooms < 2378.5 to the left, improve=0.05757607, (0 missing)
## bedrooms < 351.5 to the right, improve=0.03580898, (0 missing)
## hh < 573 to the left, improve=0.01978496, (0 missing)
## Surrogate splits:
## bedrooms < 660.5 to the right, agree=0.935, adj=0.222, (0 split)
##
## Node number 86: 9 observations
## mean=11.68575, MSE=0.09491246
##
## Node number 87: 99 observations, complexity param=0.01741974
## mean=12.23079, MSE=0.1254329
## left son=174 (76 obs) right son=175 (23 obs)
## Primary splits:
## pop < 781 to the right, improve=0.27550090, (0 missing)
## rooms < 1545 to the right, improve=0.05738988, (0 missing)
## hh < 573 to the left, improve=0.05195104, (0 missing)
## bedrooms < 612 to the left, improve=0.04391211, (0 missing)
## income < 2.91 to the left, improve=0.04362614, (0 missing)
## Surrogate splits:
## hh < 340 to the right, agree=0.859, adj=0.391, (0 split)
## rooms < 1545 to the right, agree=0.828, adj=0.261, (0 split)
## bedrooms < 357 to the right, agree=0.788, adj=0.087, (0 split)
##
## Node number 174: 76 observations
## mean=12.12853, MSE=0.08623869
##
## Node number 175: 23 observations
## mean=12.56871, MSE=0.1061994
##
## n= 803
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 803 154.6762000 12.05081
## 2) income< 5.32885 696 97.8879400 11.95748
## 4) income< 2.2561 166 16.8600300 11.63149
## 8) income< 1.2967 18 2.6962780 11.31298 *
## 9) income>=1.2967 148 12.1155400 11.67023 *
## 5) income>=2.2561 530 57.8622300 12.05958
## 10) income< 4.1186 370 39.4916900 12.00754
## 20) pop>=1134.5 183 12.0546100 11.92259 *
## 21) pop< 1134.5 187 24.8239900 12.09067
## 42) bedrooms< 334.5 79 6.8084590 11.96121 *
## 43) bedrooms>=334.5 108 15.7229200 12.18537
## 86) age< 16.5 9 0.8542122 11.68575 *
## 87) age>=16.5 99 12.4178600 12.23079
## 174) pop>=781 76 6.5541410 12.12853 *
## 175) pop< 781 23 2.4425850 12.56871 *
## 11) income>=4.1186 160 15.0505600 12.17994
## 22) age< 38.5 151 10.7898500 12.14159 *
## 23) age>=38.5 9 0.3131836 12.82332 *
## 3) income>=5.32885 107 11.2974500 12.65785
## 6) income< 6.81245 71 5.2314720 12.51700 *
## 7) income>=6.81245 36 1.8798970 12.93562 *
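# the CP table at the top of the summary supports pruning; a sketch of
# selecting the cp value that minimizes cross-validated error with
# rpart's prune (best.cp and rpart.train.fit.pruned are hypothetical
# names, not used in the comparisons below)
best.cp <- rpart.train.fit$cptable[which.min(rpart.train.fit$cptable[, "xerror"]), "CP"]
rpart.train.fit.pruned <- prune(rpart.train.fit, cp = best.cp)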
houses.train$rpart.train.fit.pred <- predict(rpart.train.fit, newdata = houses.train)
# root-mean-squared for trees on training set
print(rmspe(houses.train$log_value, houses.train$rpart.train.fit.pred))
## [1] 0.277285
# report R-squared on training data
print(cor(houses.train$log_value,houses.train$rpart.train.fit.pred)^2)
## [1] 0.6008422
cat("\n\nTraining set proportion of variance accounted",
" for by tree-structured regression = ",
sprintf("%1.3f",cor(houses.train$log_value,
houses.train$rpart.train.fit.pred)^2),sep=" ")
##
##
## Training set proportion of variance accounted for by tree-structured regression = 0.601
# root-mean-squared for trees on test set
houses.test$rpart.train.fit.pred <- predict(rpart.train.fit, newdata = houses.test)
print(rmspe(houses.test$log_value, houses.test$rpart.train.fit.pred))
## [1] 0.3478524
# report R-squared on test data
print(cor(houses.test$log_value,houses.test$rpart.train.fit.pred)^2)
## [1] 0.4037499
cat("\n\nTest set proportion of variance accounted",
" for by tree-structured regression = ",
sprintf("%1.3f",
cor(houses.test$log_value,houses.test$rpart.train.fit.pred)^2),sep=" ")
##
##
## Test set proportion of variance accounted for by tree-structured regression = 0.404
# plot the regression tree result from rpart
#pdf(file = "fig_spatial_rpart_model.pdf", width = 8.5, height = 8.5)
prp(rpart.train.fit, main="",
digits = 3, # digits to display in terminal nodes
nn = TRUE, # display the node numbers
fallen.leaves = TRUE, # put the leaves on the bottom of the page
branch = 0.5, # change angle of branch lines
branch.lwd = 2, # width of branch lines
faclen = 0, # do not abbreviate factor levels
trace = 1, # print the automatically calculated cex
shadow.col = 0, # no shadows under the leaves
branch.lty = 1, # draw branches using solid lines
split.cex = 1.2, # make the split text larger than the node text
split.prefix = "is ", # put "is " before split text
split.suffix = "?", # put "?" after split text
split.box.col = "blue", # blue split boxes (default is white)
split.col = "white", # color of text in split box
split.border.col = "blue", # blue border on split boxes
split.round = .25) # round the split box corners a tad
## cex 1 xlim c(0, 1) ylim c(0, 1)

#dev.off()
# --------------------------------------
# Tree-structured regression (full)
# --------------------------------------
# try tree-structured regression on the expanded set of variables
rpart.train.fit.full <- rpart(full.model, data = houses.train)
print(summary(rpart.train.fit.full)) # tree summary statistics and split detail
## Call:
## rpart(formula = full.model, data = houses.train)
## n= 803
##
## CP nsplit rel error xerror xstd
## 1 0.36957978 0 1.0000000 1.0018854 0.05623900
## 2 0.09253626 1 0.6304202 0.6584266 0.04000090
## 3 0.07954998 2 0.5378840 0.5860452 0.03564882
## 4 0.03441113 3 0.4583340 0.5015991 0.03371739
## 5 0.02944949 4 0.4239228 0.4951772 0.03375549
## 6 0.01801253 5 0.3944734 0.4654073 0.03219833
## 7 0.01597595 6 0.3764608 0.4325397 0.03064742
## 8 0.01159800 8 0.3445089 0.4223959 0.02909689
## 9 0.01052854 9 0.3329109 0.4020443 0.02743957
## 10 0.01000000 10 0.3223824 0.3911207 0.02648231
##
## Variable importance
## log_pc_rooms income log_pop_hh log_pc_bedrooms
## 42 26 12 10
## age pop rooms bedrooms
## 4 3 2 1
## hh
## 1
##
## Node number 1: 803 observations, complexity param=0.3695798
## mean=12.05081, MSE=0.192623
## left son=2 (543 obs) right son=3 (260 obs)
## Primary splits:
## log_pc_rooms < 0.776981 to the left, improve=0.36957980, (0 missing)
## income < 5.32885 to the left, improve=0.29410370, (0 missing)
## log_pop_hh < 0.9770098 to the right, improve=0.10676160, (0 missing)
## log_pc_bedrooms < -1.003109 to the left, improve=0.08702279, (0 missing)
## rooms < 1024 to the left, improve=0.06578183, (0 missing)
## Surrogate splits:
## income < 5.3914 to the left, agree=0.745, adj=0.212, (0 split)
## log_pop_hh < 0.8957548 to the right, agree=0.724, adj=0.146, (0 split)
## log_pc_bedrooms < -0.7291228 to the left, agree=0.712, adj=0.112, (0 split)
## pop < 659.5 to the right, agree=0.695, adj=0.058, (0 split)
## age < 43.5 to the left, agree=0.681, adj=0.015, (0 split)
##
## Node number 2: 543 observations, complexity param=0.09253626
## mean=11.86618, MSE=0.1041501
## left son=4 (207 obs) right son=5 (336 obs)
## Primary splits:
## income < 2.56845 to the left, improve=0.25309060, (0 missing)
## log_pc_rooms < 0.5121832 to the left, improve=0.25271850, (0 missing)
## rooms < 1069 to the left, improve=0.07670465, (0 missing)
## log_pop_hh < 1.15877 to the right, improve=0.05329798, (0 missing)
## age < 36.5 to the right, improve=0.04997564, (0 missing)
## Surrogate splits:
## log_pc_rooms < 0.301507 to the left, agree=0.740, adj=0.319, (0 split)
## age < 38.5 to the right, agree=0.681, adj=0.164, (0 split)
## log_pc_bedrooms < -0.7598571 to the right, agree=0.659, adj=0.106, (0 split)
## log_pop_hh < 0.8096456 to the left, agree=0.657, adj=0.101, (0 split)
## rooms < 1289 to the left, agree=0.656, adj=0.097, (0 split)
##
## Node number 3: 260 observations, complexity param=0.07954998
## mean=12.43639, MSE=0.1575289
## left son=6 (196 obs) right son=7 (64 obs)
## Primary splits:
## income < 5.7841 to the left, improve=0.30042080, (0 missing)
## log_pc_rooms < 0.9569221 to the left, improve=0.16638650, (0 missing)
## log_pop_hh < 0.6512064 to the left, improve=0.08148162, (0 missing)
## bedrooms < 533.5 to the right, improve=0.04063897, (0 missing)
## hh < 275 to the right, improve=0.03908611, (0 missing)
## Surrogate splits:
## log_pc_bedrooms < -1.05175 to the right, agree=0.823, adj=0.281, (0 split)
## log_pop_hh < 1.118858 to the left, agree=0.815, adj=0.250, (0 split)
## log_pc_rooms < 1.065515 to the left, agree=0.812, adj=0.234, (0 split)
## age < 10.5 to the right, agree=0.769, adj=0.063, (0 split)
## rooms < 5822.5 to the left, agree=0.762, adj=0.031, (0 split)
##
## Node number 4: 207 observations, complexity param=0.03441113
## mean=11.65933, MSE=0.09056042
## left son=8 (132 obs) right son=9 (75 obs)
## Primary splits:
## log_pop_hh < 0.9273377 to the right, improve=0.2839316, (0 missing)
## log_pc_rooms < 0.2758711 to the left, improve=0.2568647, (0 missing)
## log_pc_bedrooms < -1.002417 to the left, improve=0.2354202, (0 missing)
## income < 1.2967 to the left, improve=0.1453733, (0 missing)
## rooms < 1058 to the left, improve=0.1212385, (0 missing)
## Surrogate splits:
## log_pc_bedrooms < -0.8692357 to the left, agree=0.957, adj=0.880, (0 split)
## log_pc_rooms < 0.4376137 to the left, agree=0.894, adj=0.707, (0 split)
## pop < 884.5 to the right, agree=0.671, adj=0.093, (0 split)
## bedrooms < 1342 to the left, agree=0.657, adj=0.053, (0 split)
## hh < 1221.5 to the left, agree=0.657, adj=0.053, (0 split)
##
## Node number 5: 336 observations, complexity param=0.02944949
## mean=11.99361, MSE=0.06992364
## left son=10 (315 obs) right son=11 (21 obs)
## Primary splits:
## income < 5.53935 to the left, improve=0.19388220, (0 missing)
## log_pc_rooms < 0.6163027 to the left, improve=0.16945780, (0 missing)
## age < 6.5 to the right, improve=0.05711924, (0 missing)
## rooms < 733.5 to the left, improve=0.04398671, (0 missing)
## log_pop_hh < 0.8972743 to the right, improve=0.03393714, (0 missing)
## Surrogate splits:
## rooms < 15925.5 to the left, agree=0.943, adj=0.095, (0 split)
## pop < 7643.5 to the left, agree=0.943, adj=0.095, (0 split)
## bedrooms < 2672 to the left, agree=0.940, adj=0.048, (0 split)
##
## Node number 6: 196 observations, complexity param=0.01597595
## mean=12.31208, MSE=0.119794
## left son=12 (55 obs) right son=13 (141 obs)
## Primary splits:
## age < 20.5 to the left, improve=0.10308650, (0 missing)
## income < 2.23565 to the left, improve=0.06284607, (0 missing)
## log_pc_rooms < 0.9569221 to the left, improve=0.04314497, (0 missing)
## log_pc_bedrooms < -0.3007574 to the right, improve=0.02862553, (0 missing)
## log_pop_hh < 0.6512064 to the left, improve=0.02666760, (0 missing)
## Surrogate splits:
## bedrooms < 916.5 to the right, agree=0.755, adj=0.127, (0 split)
## hh < 859 to the right, agree=0.745, adj=0.091, (0 split)
## pop < 2103.5 to the right, agree=0.740, adj=0.073, (0 split)
## log_pc_bedrooms < -0.4614824 to the right, agree=0.740, adj=0.073, (0 split)
## rooms < 4986.5 to the right, agree=0.735, adj=0.055, (0 split)
##
## Node number 7: 64 observations
## mean=12.81709, MSE=0.08083434
##
## Node number 8: 132 observations, complexity param=0.01052854
## mean=11.53846, MSE=0.06584593
## left son=16 (60 obs) right son=17 (72 obs)
## Primary splits:
## income < 1.82915 to the left, improve=0.1873652, (0 missing)
## rooms < 1282.5 to the left, improve=0.1694186, (0 missing)
## log_pc_rooms < -0.1411264 to the left, improve=0.1398044, (0 missing)
## hh < 380.5 to the left, improve=0.1392920, (0 missing)
## age < 47.5 to the right, improve=0.1168654, (0 missing)
## Surrogate splits:
## log_pc_rooms < 0.1618422 to the left, agree=0.712, adj=0.367, (0 split)
## rooms < 1244 to the left, agree=0.674, adj=0.283, (0 split)
## bedrooms < 330.5 to the left, agree=0.636, adj=0.200, (0 split)
## hh < 353 to the left, agree=0.636, adj=0.200, (0 split)
## log_pop_hh < 1.249465 to the right, agree=0.629, adj=0.183, (0 split)
##
## Node number 9: 75 observations
## mean=11.87206, MSE=0.06309013
##
## Node number 10: 315 observations, complexity param=0.01801253
## mean=11.96355, MSE=0.05745312
## left son=20 (177 obs) right son=21 (138 obs)
## Primary splits:
## log_pc_rooms < 0.6163027 to the left, improve=0.15394810, (0 missing)
## log_pop_hh < 1.157636 to the right, improve=0.07602275, (0 missing)
## log_pc_bedrooms < -1.190874 to the left, improve=0.06618601, (0 missing)
## income < 4.4952 to the left, improve=0.05620627, (0 missing)
## rooms < 733.5 to the left, improve=0.04738127, (0 missing)
## Surrogate splits:
## log_pop_hh < 1.107112 to the right, agree=0.759, adj=0.449, (0 split)
## log_pc_bedrooms < -1.020448 to the left, agree=0.746, adj=0.420, (0 split)
## age < 34.5 to the left, agree=0.594, adj=0.072, (0 split)
## income < 4.29695 to the left, agree=0.584, adj=0.051, (0 split)
## rooms < 3512.5 to the left, agree=0.581, adj=0.043, (0 split)
##
## Node number 11: 21 observations
## mean=12.44456, MSE=0.04007016
##
## Node number 12: 55 observations, complexity param=0.011598
## mean=12.13415, MSE=0.09697877
## left son=24 (15 obs) right son=25 (40 obs)
## Primary splits:
## income < 3.00695 to the left, improve=0.33633130, (0 missing)
## log_pop_hh < 0.5150801 to the left, improve=0.19544110, (0 missing)
## log_pc_bedrooms < -0.4318298 to the right, improve=0.10486650, (0 missing)
## age < 15.5 to the right, improve=0.08412380, (0 missing)
## rooms < 4261 to the left, improve=0.06709558, (0 missing)
## Surrogate splits:
## log_pop_hh < 0.5470277 to the left, agree=0.873, adj=0.533, (0 split)
## bedrooms < 1497 to the right, agree=0.782, adj=0.200, (0 split)
## hh < 1271.5 to the right, agree=0.782, adj=0.200, (0 split)
## log_pc_rooms < 0.7841115 to the left, agree=0.764, adj=0.133, (0 split)
## log_pc_bedrooms < -0.5222446 to the right, agree=0.764, adj=0.133, (0 split)
##
## Node number 13: 141 observations, complexity param=0.01597595
## mean=12.38149, MSE=0.1115274
## left son=26 (115 obs) right son=27 (26 obs)
## Primary splits:
## log_pc_rooms < 0.9391865 to the left, improve=0.16036310, (0 missing)
## log_pc_bedrooms < -0.7908979 to the left, improve=0.09001682, (0 missing)
## log_pop_hh < 0.924853 to the right, improve=0.07685060, (0 missing)
## pop < 789 to the right, improve=0.04200630, (0 missing)
## income < 2.30285 to the left, improve=0.03826700, (0 missing)
## Surrogate splits:
## log_pop_hh < 0.4281059 to the right, agree=0.837, adj=0.115, (0 split)
## log_pc_bedrooms < -0.3760315 to the left, agree=0.830, adj=0.077, (0 split)
## pop < 387 to the right, agree=0.823, adj=0.038, (0 split)
## hh < 156 to the right, agree=0.823, adj=0.038, (0 split)
##
## Node number 16: 60 observations
## mean=11.41679, MSE=0.0717158
##
## Node number 17: 72 observations
## mean=11.63986, MSE=0.0383361
##
## Node number 20: 177 observations
## mean=11.88051, MSE=0.03836444
##
## Node number 21: 138 observations
## mean=12.07006, MSE=0.06174722
##
## Node number 24: 15 observations
## mean=11.83923, MSE=0.1014901
##
## Node number 25: 40 observations
## mean=12.24475, MSE=0.05043865
##
## Node number 26: 115 observations
## mean=12.3179, MSE=0.08632885
##
## Node number 27: 26 observations
## mean=12.66274, MSE=0.1259914
##
## n= 803
##
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 803 154.6762000 12.05081
## 2) log_pc_rooms< 0.776981 543 56.5535100 11.86618
## 4) income< 2.56845 207 18.7460100 11.65933
## 8) log_pop_hh>=0.9273377 132 8.6916620 11.53846
## 16) income< 1.82915 60 4.3029480 11.41679 *
## 17) income>=1.82915 72 2.7601990 11.63986 *
## 9) log_pop_hh< 0.9273377 75 4.7317600 11.87206 *
## 5) income>=2.56845 336 23.4943400 11.99361
## 10) income< 5.53935 315 18.0977300 11.96355
## 20) log_pc_rooms< 0.6163027 177 6.7905050 11.88051 *
## 21) log_pc_rooms>=0.6163027 138 8.5211160 12.07006 *
## 11) income>=5.53935 21 0.8414734 12.44456 *
## 3) log_pc_rooms>=0.776981 260 40.9575100 12.43639
## 6) income< 5.7841 196 23.4796300 12.31208
## 12) age< 20.5 55 5.3338320 12.13415
## 24) income< 3.00695 15 1.5223520 11.83923 *
## 25) income>=3.00695 40 2.0175460 12.24475 *
## 13) age>=20.5 141 15.7253600 12.38149
## 26) log_pc_rooms< 0.9391865 115 9.9278180 12.31790 *
## 27) log_pc_rooms>=0.9391865 26 3.2757770 12.66274 *
## 7) income>=5.7841 64 5.1733980 12.81709 *
houses.train$rpart.train.fit.full.pred <-
    predict(rpart.train.fit.full, newdata = houses.train)
# root-mean-squared for trees on training set
print(rmspe(houses.train$log_value, houses.train$rpart.train.fit.full.pred))
## [1] 0.2491952
# report R-squared on training data
print(cor(houses.train$log_value,houses.train$rpart.train.fit.full.pred)^2)
## [1] 0.6776176
cat("\n\nTraining set proportion of variance accounted",
" for by tree-structured regression (full model) = ",
sprintf("%1.3f",cor(houses.train$log_value,
houses.train$rpart.train.fit.full.pred)^2),sep=" ")
##
##
## Training set proportion of variance accounted for by tree-structured regression (full model) = 0.678
# root-mean-squared for trees on test set
houses.test$rpart.train.fit.full.pred <- predict(rpart.train.fit.full,
newdata = houses.test)
print(rmspe(houses.test$log_value, houses.test$rpart.train.fit.full.pred))
## [1] 0.2924421
# report R-squared on test data
print(cor(houses.test$log_value,houses.test$rpart.train.fit.full.pred)^2)
## [1] 0.5736942
cat("\n\nTest set proportion of variance accounted",
" for by tree-structured regression (full model) = ",
sprintf("%1.3f",cor(houses.test$log_value,
houses.test$rpart.train.fit.full.pred)^2),sep=" ")
##
##
## Test set proportion of variance accounted for by tree-structured regression (full model) = 0.574
# plot the regression tree result from rpart
#pdf(file = "fig_spatial_rpart_model_full.pdf", width = 8.5, height = 8.5)
prp(rpart.train.fit.full, main="",
digits = 3, # digits to display in terminal nodes
nn = TRUE, # display the node numbers
fallen.leaves = TRUE, # put the leaves on the bottom of the page
branch = 0.5, # change angle of branch lines
branch.lwd = 2, # width of branch lines
faclen = 0, # do not abbreviate factor levels
trace = 1, # print the automatically calculated cex
shadow.col = 0, # no shadows under the leaves
branch.lty = 1, # draw branches using solid lines
split.cex = 1.2, # make the split text larger than the node text
split.prefix = "is ", # put "is " before split text
split.suffix = "?", # put "?" after split text
split.box.col = "blue", # blue split boxes (default is white)
split.col = "white", # color of text in split box
split.border.col = "blue", # blue border on split boxes
split.round = .25) # round the split box corners a tad
## cex 0.816 xlim c(0, 1) ylim c(0, 1)

#dev.off()
# --------------------------------------
# Random forests (simple)
# --------------------------------------
set.seed(9999) # for reproducibility
rf.train.fit <- randomForest(simple.model, data = houses.train,
    mtry = 3, importance = TRUE, na.action = na.omit)
# review the random forest solution
print(rf.train.fit)
##
## Call:
## randomForest(formula = simple.model, data = houses.train, mtry = 3, importance = TRUE, na.action = na.omit)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.06991898
## % Var explained: 63.7
# check importance of the individual explanatory variables
# pdf(file = "fig_spatial_random_forest_simple_importance.pdf",
# width = 11, height = 8.5)
varImpPlot(rf.train.fit, main = "", pch = 20, col = "darkblue")

#dev.off()
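# mtry was fixed at 3 above; randomForest also provides tuneRF, which
# searches over mtry by out-of-bag error -- a sketch (rf.x, rf.y, and
# tuned.mtry are hypothetical names; the columns match simple.model)
rf.x <- houses.train@data[, c("income", "age", "rooms", "bedrooms", "pop", "hh")]
rf.y <- houses.train@data$log_value
tuned.mtry <- tuneRF(rf.x, rf.y, mtryStart = 3, ntreeTry = 100,
    stepFactor = 2, improve = 0.01, trace = FALSE, plot = FALSE)
print(tuned.mtry)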
# random forest predictions for the training set
# (note: supplying newdata returns in-bag predictions, which are optimistic
# relative to the out-of-bag % Var explained reported above)
houses.train$rf.train.fit.pred <- predict(rf.train.fit, type = "response",
    newdata = houses.train)
# root-mean-squared for random forest on training set
print(rmspe(houses.train$log_value, houses.train$rf.train.fit.pred))
## [1] 0.1146651
# report R-squared on training data
print(cor(houses.train$log_value,houses.train$rf.train.fit.pred)^2)
## [1] 0.9452182
cat("\n\nTraining set proportion of variance accounted",
"for by random forests (simple model) = ",
sprintf("%1.3f",
cor(houses.train$log_value,houses.train$rf.train.fit.pred)^2),sep=" ")
##
##
## Training set proportion of variance accounted for by random forests (simple model) = 0.945
# random forest predictions for the test set using model from training set
houses.test$rf.train.fit.pred <- predict(rf.train.fit,
    type = "response", newdata = houses.test)
# root-mean-squared for random forest on test set
print(rmspe(houses.test$log_value, houses.test$rf.train.fit.pred))
## [1] 0.2942473
# report R-squared on test data
print(cor(houses.test$log_value,houses.test$rf.train.fit.pred)^2)
## [1] 0.5646507
cat("\n\nTest set proportion of variance accounted",
" for by random forests (simple model) = ",
sprintf("%1.3f",
cor(houses.test$log_value,houses.test$rf.train.fit.pred)^2),sep=" ")
##
##
## Test set proportion of variance accounted for by random forests (simple model) = 0.565
# --------------------------------------
# Random forests (full)
# --------------------------------------
set.seed(9999) # for reproducibility
rf.train.fit.full <- randomForest(full.model, data = houses.train,
    mtry = 3, importance = TRUE, na.action = na.omit)
# review the random forest solution
print(rf.train.fit.full)
##
## Call:
## randomForest(formula = full.model, data = houses.train, mtry = 3, importance = TRUE, na.action = na.omit)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 0.06328609
## % Var explained: 67.15
# check importance of the individual explanatory variables
#pdf(file = "fig_spatial_random_forest_full_importance.pdf",
#width = 11, height = 8.5)
varImpPlot(rf.train.fit.full, main = "", pch = 20,
cex = 1.25, col = "darkblue", lcolor = "black")

#dev.off()
# random forest predictions for the training set
houses.train$rf.train.fit.full.pred <- predict(rf.train.fit.full,
    type = "response", newdata = houses.train)
# root-mean-squared for random forest on training set
print(rmspe(houses.train$log_value, houses.train$rf.train.fit.full.pred))
## [1] 0.1092729
# report R-squared on training data
print(cor(houses.train$log_value,houses.train$rf.train.fit.full.pred)^2)
## [1] 0.9486865
cat("\n\nTraining set proportion of variance accounted",
" for by random forests (full model) = ",
sprintf("%1.3f",cor(houses.train$log_value,
houses.train$rf.train.fit.full.pred)^2),sep=" ")
##
##
## Training set proportion of variance accounted for by random forests (full model) = 0.949
# random forest predictions for the test set using model from training set
houses.test$rf.train.fit.full.pred <- predict(rf.train.fit.full,
    type = "response", newdata = houses.test)
# root-mean-squared for random forest on test set
print(rmspe(houses.test$log_value, houses.test$rf.train.fit.full.pred))
## [1] 0.263197
# report R-squared on test data
print(cor(houses.test$log_value,houses.test$rf.train.fit.full.pred)^2)
## [1] 0.6511199
cat("\n\nTest set proportion of variance accounted",
" for by random forests (full model) = ",
sprintf("%1.3f",cor(houses.test$log_value,
houses.test$rf.train.fit.full.pred)^2),sep=" ")
##
##
## Test set proportion of variance accounted for by random forests (full model) = 0.651
# --------------------------------------
# Geographically weighted regression
# --------------------------------------
# bandwidth calculation may take a while
set.bandwidth <- gwr.sel(pace.barry.model,
data=houses.train, verbose = FALSE, show.error.messages = FALSE)
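# the chosen bandwidth can be inspected directly; it is expressed in the
# units of the coordinates (decimal degrees here, since no CRS or longlat
# setting was supplied)
print(set.bandwidth)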
# fit the geographically-weighted regression with bandwidth value set.bandwidth
gwr.train.fit <- gwr(pace.barry.model, bandwidth = set.bandwidth,
predictions = TRUE, data=houses.train, fit.points = houses.train)
# extract training set predictions
houses.train$gwr.train.fit.pred <- gwr.train.fit$SDF$pred
# root-mean-squared for gwr on training set
print(rmspe(houses.train$log_value, houses.train$gwr.train.fit.pred))
## [1] 0.2343306
# report R-squared on training data
print(cor(houses.train$log_value, houses.train$gwr.train.fit.pred)^2)
## [1] 0.715599
cat("\n\nTraining set proportion of variance accounted",
" for by geographically-weighted regression = ",
sprintf("%1.3f",cor(houses.train$log_value,
houses.train$gwr.train.fit.pred)^2),sep=" ")
##
##
## Training set proportion of variance accounted for by geographically-weighted regression = 0.716
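# the SDF slot also holds the locally fitted coefficients, so the spatial
# variation of an effect can be mapped; a sketch with sp's spplot for the
# local income coefficient (assumes the coefficient column carries the
# model term's name, as spgwr does by default)
spplot(gwr.train.fit$SDF, "income")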
# fit the geographically-weighted regression with bandwidth value set.bandwidth
# fit to training data and specify test data
gwr.train.fit <- gwr(pace.barry.model, bandwidth = set.bandwidth,
predictions = TRUE, data=houses.train, fit.points = houses.test)
# extract test set predictions
houses.test$gwr.train.fit.pred <- gwr.train.fit$SDF$pred
# root-mean-squared for gwr on test set
print(rmspe(houses.test$log_value, houses.test$gwr.train.fit.pred))
## [1] 0.273895
# report R-squared on test data
print(cor(houses.test$log_value, houses.test$gwr.train.fit.pred)^2)
## [1] 0.6265548
cat("\n\nTest set proportion of variance accounted",
" for by geographically-weighted regression = ",
sprintf("%1.3f",cor(houses.test$log_value,
houses.test$gwr.train.fit.pred)^2),sep=" ")
##
##
## Test set proportion of variance accounted for by geographically-weighted regression = 0.627
# --------------------------------------
# Gather results for a single report
# --------------------------------------
# model performance summary across methods
methods <- c("Linear regression Pace and Barry (1997)",
"Tree-structured regression (simple model)",
"Tree-structured regression (full model)",
"Random forests (simple model)",
"Random forests (full model)",
"Geographically weighted regression (GWR)")
methods.performance.data.frame <- data.frame(methods)
methods.performance.data.frame$training <-
c(round(cor(houses.train$log_value,predict(pace.barry.train.fit))^2
,digits=3),
round(cor(houses.train$log_value,
houses.train$rpart.train.fit.pred)^2,digits=3),
round(cor(houses.train$log_value,
houses.train$rpart.train.fit.full.pred)^2,digits=3),
round(cor(houses.train$log_value,
houses.train$rf.train.fit.pred)^2,digits=3),
round(cor(houses.train$log_value,
houses.train$rf.train.fit.full.pred)^2,digits=3),
round(cor(houses.train$log_value,
houses.train$gwr.train.fit.pred)^2,digits=3))
methods.performance.data.frame$test <-
c(round(cor(houses.test$log_value,
predict(pace.barry.train.fit, newdata = houses.test))^2,digits=3),
round(cor(houses.test$log_value,
houses.test$rpart.train.fit.pred)^2,digits=3),
round(cor(houses.test$log_value,
houses.test$rpart.train.fit.full.pred)^2,digits=3),
round(cor(houses.test$log_value,
houses.test$rf.train.fit.pred)^2,digits=3),
round(cor(houses.test$log_value,
houses.test$rf.train.fit.full.pred)^2,digits=3),
round(cor(houses.test$log_value,
houses.test$gwr.train.fit.pred)^2,digits=3))
print(methods.performance.data.frame)
## methods training test
## 1 Linear regression Pace and Barry (1997) 0.648 0.562
## 2 Tree-structured regression (simple model) 0.601 0.404
## 3 Tree-structured regression (full model) 0.678 0.574
## 4 Random forests (simple model) 0.945 0.565
## 5 Random forests (full model) 0.949 0.651
## 6 Geographically weighted regression (GWR) 0.716 0.627
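# the same report could carry root-mean-squared prediction error
# alongside R-squared; a sketch for the test set, reusing rmspe from
# cvTools and the stored predictions
methods.performance.data.frame$test.rmspe <-
    c(round(rmspe(houses.test$log_value,
        predict(pace.barry.train.fit, newdata = houses.test)), digits = 3),
      round(rmspe(houses.test$log_value,
        houses.test$rpart.train.fit.pred), digits = 3),
      round(rmspe(houses.test$log_value,
        houses.test$rpart.train.fit.full.pred), digits = 3),
      round(rmspe(houses.test$log_value,
        houses.test$rf.train.fit.pred), digits = 3),
      round(rmspe(houses.test$log_value,
        houses.test$rf.train.fit.full.pred), digits = 3),
      round(rmspe(houses.test$log_value,
        houses.test$gwr.train.fit.pred), digits = 3))
print(methods.performance.data.frame)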