Melbourne Housing Market - Tree Based Analysis
This research is a first attempt at implementing a tree-based model on Melbourne housing data collected by Secret Agent in 2016/17. First, we will load and prepare the data:
options(warn=-1)
require(googlesheets)
## Loading required package: googlesheets
require(magrittr)
## Loading required package: magrittr
apt <- gs_key("1Hw2LTg_-vucgYVu0_JN1dFMtVwRFQtnY7BRnxjybPbk") %>% gs_read(ws = 1)
## Sheet successfully identified: "PPSM Master Index 2016"
## Accessing worksheet titled 'Apartments'.
## No encoding supplied: defaulting to UTF-8.
str(apt)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1674 obs. of 15 variables:
## $ Address : chr "8/5-7 Princes St" "7/20 Abbotsford St" "3/92 Charles St" "2/89 Stafford St" ...
## $ Suburb : chr "Abbotsford" "Abbotsford" "Abbotsford" "Abbotsford" ...
## $ Sale Date : chr "08/2016" "05/2016" "06/2016" "09/2016" ...
## $ Price : chr "$312,000" "$441,000" "$340,000" "$337,500" ...
## $ Outdoor Area : num NA 3.1 22.3 NA 3.5 7.4 15 6 5.7 0 ...
## $ Internal Floor Area : num 36.9 44.4 35.1 31 43.4 48.1 42.1 46.1 45.5 51.6 ...
## $ Price Per SQM : chr "$8,455.28" "$9,932.43" "$9,686.61" "$10,887.10" ...
## $ PPSM (outdoor plus indoor): chr NA "$9,284" "$5,923" NA ...
## $ Bed : chr "1" "1" "1" "1" ...
## $ Bath : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Car : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Type : chr "60s-80s" "60s-80s" "60s-80s" "60s-80s" ...
## $ Notes : chr "1 of 12" "Top floor" "Private courtyard" NA ...
## $ Other : chr NA NA NA NA ...
## $ Region : chr NA NA NA NA ...
We can see that the Apartment Spreadsheet has nearly 1700 observations of 15 variables, although not all of these are relevant. First, we need to format the variables so they can be interpreted by our model.
#delete rows where no suburb is indicated
apt <- subset(apt, !is.na(apt$Suburb))
#delete calculated/derived columns
drops <- c("Price Per SQM","PPSM (outdoor plus indoor)")
apt <- apt[,!(names(apt) %in% drops)]
#format price as number
apt$Price <- as.numeric(lapply(apt$Price,gsub,pattern="[//$//,]",replacement=""))
#format Bath, Car as numeric
apt$Bath <- as.numeric(apt$Bath)
apt$Car <- as.numeric(apt$Car)
#format Bed, Type and Suburb as factor
apt$Bed <- factor(apt$Bed)
apt$Type <- factor(apt$Type)
apt$Suburb <- factor(apt$Suburb)
#make names easier to use
names(apt)[c(3,5,6)] <- c("Date","Outdoor","Indoor")
#convert Sale Date to date value (double)
apt$Date <- strsplit(apt$Date,"/")
apt$Date <- lapply(apt$Date, function(x) {
x <- tail(x, n = 2)
x <- paste(x,collapse = '-')
})
apt$Date <- as.Date(paste("15-",apt$Date,sep = ''), format = "%d-%m-%Y")
#force use of certain variable as reference in dummy:
#DF <- within(DF, b <- relevel(b, ref = 3))
#source: Gavin Simpson. Link: http://stackoverflow.com/questions/3872070/how-to-force-r-to-use-a-specified-factor-level-as-reference-in-a-regression
apt <- within(apt, Suburb <- relevel(Suburb, ref = 18))
#add missing data
apt$Outdoor[apt$Address=="1403/505 St Kilda"] <- 24.2
apt$Outdoor[apt$Address=="2602/80 Clarendon"] <- 50.5
apt$Outdoor[apt$Address=="8/144 George"] <- 6
apt$Outdoor[apt$Address=="1101/279 Wellington"] <- 0
#remove NAs and create Balcony dummy variable
apt <- apt[!is.na(apt$Outdoor),]
apt <- apt[!is.na(apt$Indoor),]
apt$Balcony <- ifelse(apt$Outdoor>0,T,F)
str(apt)
## Classes 'tbl_df', 'tbl' and 'data.frame': 804 obs. of 14 variables:
## $ Address: chr "7/20 Abbotsford St" "3/92 Charles St" "8/18 Nicholson St" "11/205 Gipps St" ...
## $ Suburb : Factor w/ 30 levels "Melbourne","Abbotsford",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Date : Date, format: "2016-05-15" "2016-06-15" ...
## $ Price : num 441000 340000 360000 470000 420000 ...
## $ Outdoor: num 3.1 22.3 3.5 7.4 15 6 5.7 0 16.6 0 ...
## $ Indoor : num 44.4 35.1 43.4 48.1 42.1 46.1 45.5 51.6 45.4 50.3 ...
## $ Bed : Factor w/ 5 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 2 ...
## $ Bath : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Car : num 1 1 1 1 1 1 1 1 1 1 ...
## $ Type : Factor w/ 6 levels "60s-80s","Art Deco",..: 1 1 1 3 3 3 3 6 6 1 ...
## $ Notes : chr "Top floor" "Private courtyard" NA "designed by Kann Finch" ...
## $ Other : chr NA NA NA NA ...
## $ Region : chr NA NA NA NA ...
## $ Balcony: logi TRUE TRUE TRUE TRUE TRUE TRUE ...
Cleaning the data wittled our observations down to 800 usable data points. Next, we will create our basic tree:
Tree-Based Decision Model
require(tree)
## Loading required package: tree
set.seed(99)
#remove columns not used for predictions
vars <- apt[,!(names(apt) %in% names(apt)[c(1,11:14)])]
#show the shape of Price Variable Data (positively skewed)
hist(vars$Price)

#create dummy variable and training set
High <- ifelse(vars$Price>=1000000,"Yes","No")
vars <- data.frame(vars,High)
Now we fit a tree to these data and summarize and plot it. Notice that we have to exclude Price from the model because the response is derived from it.
tree.apt <- tree(High~.-Price,data=vars)
summary(tree.apt)
##
## Classification tree:
## tree(formula = High ~ . - Price, data = vars)
## Variables actually used in tree construction:
## [1] "Indoor" "Outdoor" "Suburb"
## Number of terminal nodes: 9
## Residual mean deviance: 0.1178 = 93.69 / 795
## Misclassification error rate: 0.02239 = 18 / 804
tree.apt
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 804 489.40 No ( 0.909204 0.090796 )
## 2) Indoor < 98.45 688 49.16 No ( 0.994186 0.005814 )
## 4) Outdoor < 52.45 674 27.27 No ( 0.997033 0.002967 )
## 8) Suburb: Melbourne,Abbotsford,Albert Park,Brunswick,Brunswick East,Burnley,Carlton North,Clifton Hill,Collingwood,Cremorne,Docklands,East Melbourne,Fitzroy,Fitzroy North,Flemington,Hawthorn,Kensington,Middle Park,North Melbourne,Northcote,Parkville,Port Melbourne,Prahran,Richmond,South Melbourne,Southbank,Travancore,West Melbourne 608 0.00 No ( 1.000000 0.000000 ) *
## 9) Suburb: Carlton,South Yarra 66 17.92 No ( 0.969697 0.030303 ) *
## 5) Outdoor > 52.45 14 11.48 No ( 0.857143 0.142857 ) *
## 3) Indoor > 98.45 116 156.60 Yes ( 0.405172 0.594828 )
## 6) Indoor < 123.75 58 71.85 No ( 0.689655 0.310345 )
## 12) Suburb: Melbourne,Abbotsford,Brunswick,Brunswick East,Carlton,Collingwood,Docklands,Fitzroy,Kensington,North Melbourne,Northcote,Parkville,Prahran,Richmond,Southbank,Travancore,West Melbourne 46 45.48 No ( 0.804348 0.195652 )
## 24) Suburb: Brunswick East,Collingwood,Fitzroy,Kensington,North Melbourne,Northcote,Parkville,Prahran,Travancore,West Melbourne 12 0.00 No ( 1.000000 0.000000 ) *
## 25) Suburb: Melbourne,Abbotsford,Brunswick,Carlton,Docklands,Richmond,Southbank 34 39.30 No ( 0.735294 0.264706 ) *
## 13) Suburb: Albert Park,East Melbourne,Hawthorn,Port Melbourne,South Melbourne,South Yarra 12 13.50 Yes ( 0.250000 0.750000 ) *
## 7) Indoor > 123.75 58 42.72 Yes ( 0.120690 0.879310 )
## 14) Suburb: Brunswick East,Clifton Hill,Fitzroy North,Richmond,West Melbourne 5 0.00 No ( 1.000000 0.000000 ) *
## 15) Suburb: Melbourne,Albert Park,Brunswick,Carlton,Collingwood,Docklands,East Melbourne,Fitzroy,Hawthorn,Parkville,Port Melbourne,Prahran,South Melbourne,South Yarra,Southbank 53 17.03 Yes ( 0.037736 0.962264 )
## 30) Indoor < 134.7 14 11.48 Yes ( 0.142857 0.857143 ) *
## 31) Indoor > 134.7 39 0.00 Yes ( 0.000000 1.000000 ) *
Although not my favourite way of displaying a tree, we can see that having indoor space of less than about 100 square metres always leads to the apartment being classified as <$1 million, while apartments with indoor space greater than 135 are always classified as >$1 million. In between these two variables, the decision is based on which suburb the apartment is located in. Let’s create a training and test set (500,304) split of the 804 observations, grow the tree on the training set. and evaluate its performance on the test set.
set.seed(99)
train <- sample(1:nrow(vars),500)
tree.apt <- tree(High~.-Price,vars,subset=train)
plot(tree.apt);text(tree.apt,pretty=0,cex=0.5)

tree.pred <- predict(tree.apt,vars[-train,],type="class")
with(vars[-train,],table(tree.pred,High))
## High
## tree.pred No Yes
## No 263 15
## Yes 10 16
(263+16)/304
## [1] 0.9177632
The plot displays the decisions made by the model nicely. While a correct prediction rate of 91.78% might look very impressive, if we simply guessed No for each prediction, our success rate would be \(278/304 = 91.45\%\), so the model is only a tiny 0.33% improvement. The error rate on correcly versus falsely predicing apartments that sold for >$1 million is a measly \(16/26 = 60\%\). This tree was grown to full depth, and might be too variable. We now use CV to prune it.
cv.apt <- cv.tree(tree.apt,FUN=prune.misclass)
cv.apt
## $size
## [1] 5 3 1
##
## $dev
## [1] 29 29 45
##
## $k
## [1] -Inf 2.5 13.0
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.apt)

prune.apt <- prune.misclass(tree.apt,best=3)
plot(prune.apt);text(prune.apt,pretty=0,cex=0.5)

prune.pred <- predict(prune.apt,vars[-train,],type="class")
with(vars[-train,],table(prune.pred,High))
## High
## prune.pred No Yes
## No 256 8
## Yes 17 23
(256+23)/304
## [1] 0.9177632
We can see that the pruned tree produced similar results, but reduced the number of terminal nodes from 11 down to just 3.
Random Forests
RF builds lots of bushy trees, then averages them to reduce the variance. Lets fit a random forest and see how well it performs. We will use the response price, the dwelling sales price (in dollars)
require(randomForest)
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
rf.apt <- randomForest(Price~.,data=vars,subset=train)
rf.apt
##
## Call:
## randomForest(formula = Price ~ ., data = vars, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 3
##
## Mean of squared residuals: 15950147604
## % Var explained: 86.05
plot(rf.apt)

We can see that the mean squared error (based on out of bag (OOB) estimates, or the 1/3rd of data not used in each tree model) falls very quickly over the first 100 iterations, but after this the error rate barely changes. This is likely as close as the RF model can get to irreductible error. The model reports that mtry=3, which is the number of variables randomly chosen at each split. Since \(p=8\) here, we could try all 8 possible values of the mtry. We will do so, and record results and make a plot.
oob.err <- double(8)
test.err <- double(8)
for(mtry in 1:8){
fit <- randomForest(Price~.,data=vars,subset=train,mtry=mtry,ntree=500)
oob.err[mtry] <- fit$mse[500]
pred <- predict(fit,vars[-train,])
test.err[mtry] <- with(vars[-train,],mean((Price-pred)^2))
cat(mtry," ")
}
## 1 2 3 4 5 6 7 8
matplot(1:mtry,cbind(test.err,oob.err),pch=19,col=c("red","blue"),type="b",ylab="Mean Squared Error")
legend("topright",legend=c("OOB","Test"),pch=19,col=c("red","blue"))

It looks like the test error is smallest when \(mtry=3\). After this the MSE is about the same, so we will use the simplest model again. Next let’s see how accurate our predictions are:
fit <- randomForest(Price~.,data=vars,subset=train,mtry=3,ntree=500)
pred <- predict(fit,vars[-train,])
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
ggplot(data=vars[train,],aes(x=log(Indoor), y=log(Price), color= log(fit$predicted)) ) +
geom_point()

summary(sqrt((vars$Price[-train]-pred)^2))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 477.4 28240.0 57860.0 80280.0 99550.0 1563000.0
Looks like on average, our predictions of apartment prices are within $94,100, with an interquartile range between $30,000 and $100,000 (50% of the data falls between these two regions, and 75% is below the Q3 value). However, we can see that the error for the remaining 25% above Q3 quickly spirals out of control, with the highest prediciton off by $1.6 million.
Main Takeaways
It looks like the random forest method modelled the data quite accurately, explaining about 80% of variance in price. However, as always there are outliers where actual and predicted prices are very far apart, which hints at the fact that there are 1) other factors that may significantly impact sales price (e.g. local amenities, age of aprtment/building etc) and 2 that some variation in sales prices cannot be explained rationally.
Also, one must keep in mind that while the model very accurately predicts current and past prices, it tells us nothing about future trends and using it to forecast prices would only work if we assume ceteris paribus, i.e. everything else (demand, supply and so on) remains constant.