Data source
The price data were for properties in a selected area in Seattle, WA, that were sold on the market within the past 1 year. House types of interest are single house, townhouse and condo. The data can be downloaded from here, applying a proper filter on the house type and sales record period. The website apparently applies a upper limit for the number of records shown simultaneously, therefore I only downloaded a limited size of data for preliminary data analysis. I have been trying to use automated process such as RSelenium to download more data, although there were issues to be solved.
I’ve also explored the demographic data for given zip codes from using Zillow API, but it seemed that the statistics are not specific to each different zip code. It might be different if the area of interest is larger.
Prediction model
Outcome: sale price
Explanatory variables: home type, zip code, # bedrooms, # bathrooms, square feet, year built, parking spots, parking type
Final models can have various transformations of the given explanatory variables.
For now I’ve only tried Lasso regression and support vector regression. The metric used to evaluate model performance is mean squared error (MSE). There seemed to be non-linear relationship between the outcome and explanatory variables, so I used radial basis function (RBF) kernel in the SV regression. I used a 80%/20% training/testing data split, and 10-fold cross validation for building each model.
suppressPackageStartupMessages(library(dplyr))
#== Data loading ==#
house <- read.csv('house.csv')
condo <- read.csv('condo.csv')
table(house$HOME.TYPE)
##
## Condo/Coop Multi-Family (2-4 Unit)
## 4 4
## Single Family Residential Townhouse
## 634 272
table(condo$HOME.TYPE)
##
## Condo/Coop
## 636
#select only house and townhouse
house = house[house$HOME.TYPE %in% c('Single Family Residential','Townhouse'),]
#combine the two datasets
property <- tbl_df(rbind(house,condo))
names(property) <- tolower(names(property))
table(property$city)
##
## Seattle SEATTLE
## 1540 1 1
table(property$state)
##
## WA
## 1542
All in Seattle, WA, therefore this two variables are not informative.
#select variables of interest
property_sub <- property %>%
mutate(zip_f = as.factor(zip)) %>%
select(home.type,zip_f,beds,baths,location,sqft,lot.size,year.built,parking.spots,parking.type,latitude,longitude,last.sale.price)
#make a factorized zipcode
property_sub <- property_sub[complete.cases(select(property_sub,-lot.size)),] #lot.size is meaningful only for houses/townhouses
#drop unused levels
property_sub <- droplevels(property_sub)
#== Cleaning ==#
table(property_sub$home.type)
##
## Condo/Coop Single Family Residential
## 552 530
## Townhouse
## 263
table(property_sub$zip_f)
##
## 9112 98101 98102 98104 98109 98112 98119 98121 98122 98133 98136 98144
## 1 162 164 34 51 247 45 40 484 1 1 115
table(property_sub$location)
##
## Arboretum Broadmoor Capitol Hill
## 16 14 451
## Central Area Denny Blaine Downtown
## 152 16 105
## First Hill Judkins Leschi
## 104 34 56
## Madison Park Madison Valley Madrona
## 19 83 71
## Montlake Mt Baker N Beacon Hill
## 3 1 1
## North Capitol Hill Queen Anne Seattle
## 27 90 36
## Washington Park Belltown Broadway
## 22 1 1
## Denny Triangle South Lake Union
## 33 9
#remove records with <5 obs of zip/location for more stable inference
in_zip <- property_sub %>% count(zip_f) %>% filter(n>=5) %>% select(zip_f) %>% unlist
in_loc <- property_sub %>% count(location) %>% filter(n>=5) %>% select(location) %>% unlist
property_sub <- filter(property_sub,zip_f %in% in_zip,location %in% in_loc)
#check zip code
library(zipcode)
data(zipcode)
ref_zip = subset(zipcode, city=='Seattle'&state=='WA')
ref_zip = ref_zip[,c("zip","latitude","longitude")]
names(ref_zip) <- c('zip_f','ref_lati','ref_long')
zip_chk = merge(property_sub,ref_zip,by='zip_f')
zip_chk %>% filter(abs(latitude-ref_lati)>0.05 | abs(longitude-ref_long)>0.05) #seems to be all okay
## [1] zip_f home.type beds baths
## [5] location sqft lot.size year.built
## [9] parking.spots parking.type latitude longitude
## [13] last.sale.price ref_lati ref_long
## <0 rows> (or 0-length row.names)
#beds and baths
table(property_sub$beds)
##
## 0 1 2 3 4 5 6 7 8
## 53 331 349 350 178 64 6 3 1
table(property_sub$baths)
##
## 0.75 1 1.25 1.5 1.75 2 2.25 2.5 2.75 3 3.25 3.5 3.75 4 4.25
## 15 413 13 114 146 154 96 141 35 65 49 56 12 8 5
## 4.5 4.75 5 5.75
## 5 2 4 2
property_sub <- filter(property_sub,beds<=7,baths<=5) #remove records with >7 bedrooms or >5 bathrooms
#square feet
hist(property_sub$sqft)
range(property_sub$sqft,na.rm = T)
## [1] 316 8920
hist(log(property_sub$sqft)) #probably better to use log transformed sqft to decrease leverage of extreme values
#lot.size
hist(property_sub$lot.size)
range(property_sub$lot.size,na.rm = T)
## [1] 1 125361
property_sub$lot.size[property_sub$lot.size==1] <- NA #set obs with lot.size==1 to be NA
hist(log(property_sub$lot.size)) #probably better to use log transformed lot.size
#year built
hist(property_sub$year.built)
#parking.spots
table(property_sub$parking.spots)
##
## 0 1 2 3 4 5 6 8
## 86 985 246 7 5 1 1 1
property_sub <- filter(property_sub,parking.spots<=4) #remove records with >4 parking spots
#parking type
table(property_sub$parking.type)
##
## Garage
## 468 861
levels(property_sub$parking.type)[1] <- 'other' #non-garage parking
#create tranformed variables
property_sub <- property_sub %>%
mutate(log_sqft = log(sqft),log_lotsize = log(lot.size)) %>%
select(-longitude,-latitude)
#drop unused levels
property_sub <- droplevels(property_sub)
#== Download demographic data from Zillow for each zipcode ==#
library(RCurl)
## Loading required package: bitops
library(XML)
zipcodes = levels(property_sub$zip_f)
#zid = "***********" #my zillow ID
demo = data.frame()
for(z in zipcodes){
reply = getForm("http://www.zillow.com/webservice/GetDemographics.htm",
'zws-id' = zid,
zip = z)
node = xmlParse(reply)
root = xmlRoot(node)
subnode = getNodeSet(root,'//page//table//data')
demo_tmp = xmlToDataFrame(subnode[[length(subnode)]],stringsAsFactors = F)
demo_tmp_df = as.data.frame(cbind(z,t(demo_tmp[2])))
names(demo_tmp_df) = c("zip",demo_tmp[[1]])
demo = rbind(demo,demo_tmp_df)
}
demo
## zip Median Household Income Single Males Single Females
## values 98101 44512.0130806292 0.146462187349365 0.124578258618535
## values1 98102 44512.0130806292 0.146462187349365 0.124578258618535
## values2 98104 44512.0130806292 0.146462187349365 0.124578258618535
## values3 98109 44512.0130806292 0.146462187349365 0.124578258618535
## values4 98112 44512.0130806292 0.146462187349365 0.124578258618535
## values5 98119 44512.0130806292 0.146462187349365 0.124578258618535
## values6 98121 44512.0130806292 0.146462187349365 0.124578258618535
## values7 98122 44512.0130806292 0.146462187349365 0.124578258618535
## values8 98144 44512.0130806292 0.146462187349365 0.124578258618535
## Median Age Homes With Kids Average Household Size
## values 36 0.313623902816284 2.58883240001203
## values1 36 0.313623902816284 2.58883240001203
## values2 36 0.313623902816284 2.58883240001203
## values3 36 0.313623902816284 2.58883240001203
## values4 36 0.313623902816284 2.58883240001203
## values5 36 0.313623902816284 2.58883240001203
## values6 36 0.313623902816284 2.58883240001203
## values7 36 0.313623902816284 2.58883240001203
## values8 36 0.313623902816284 2.58883240001203
## Average Commute Time (Minutes)
## values 26.375545725891282
## values1 26.375545725891282
## values2 26.375545725891282
## values3 26.375545725891282
## values4 26.375545725891282
## values5 26.375545725891282
## values6 26.375545725891282
## values7 26.375545725891282
## values8 26.375545725891282
#it turned out that the demographic statistics from Zillow are the same across different zipcodes
#therefore this information is not useful for my analysis
It turned out that the demographic statistics from Zillow are the same across different zipcodes, therefore this information is not useful for my analysis.
#== Exploratory analysis ==#
pairs(property_sub)
Split the data set into training set and testing set with a proportion of 80% vs 20%. Use 10-fold cross-validation to find the best model.
#== Build model ==#
set.seed(123)
n = dim(property_sub)[1]
inTrain = sample(1:n,size = round(0.8*n))
training = property_sub[inTrain,]
testing = property_sub[-inTrain,]
#check home type distribution
home_type_frac = table(property_sub$home.type)
home_type_frac/sum(home_type_frac)
##
## Condo/Coop Single Family Residential
## 0.4123401 0.3905192
## Townhouse
## 0.1971407
home_type_frac = table(training$home.type)
home_type_frac/sum(home_type_frac) #approximately the same proportions
##
## Condo/Coop Single Family Residential
## 0.4158043 0.3960489
## Townhouse
## 0.1881468
#Normalize continuous variables. Note testing data use the same center and scale as training data.
train_scale = scale(training[,c("beds","baths","sqft","lot.size","year.built","parking.spots","log_sqft","log_lotsize")])
training[,c("beds","baths","sqft","lot.size","year.built","parking.spots","log_sqft","log_lotsize")] <- train_scale
y_train = scale(training$last.sale.price)
testing[,c("beds","baths","sqft","lot.size","year.built","parking.spots","log_sqft","log_lotsize")] =
scale(testing[,c("beds","baths","sqft","lot.size","year.built","parking.spots","log_sqft","log_lotsize")],
center = attr(train_scale,'scaled:center'),
scale = attr(train_scale,'scaled:scale'))
y_test = scale(testing$last.sale.price,
center = attr(y_train,'scaled:center'),
scale = attr(y_train,'scaled:scale'))
#try lasso
library(glmnet)
## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-2
x_train = model.matrix(~-1+home.type+zip_f+beds+baths+log_sqft+year.built+parking.spots+parking.type,training)
lambda_grids = 10^seq(-5,1,length.out = 100)
set.seed(999)
lasso_cv = cv.glmnet(x_train,y_train,nfolds = 10,alpha=1,lambda = lambda_grids)
plot(lasso_cv)
coef(lasso_cv,s='lambda.min',exact=TRUE)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.0875822645
## home.typeCondo/Coop 0.5109954733
## home.typeSingle Family Residential .
## home.typeTownhouse .
## zip_f98102 -0.0855771713
## zip_f98104 -0.1057240934
## zip_f98109 0.1185561214
## zip_f98112 -0.0036294879
## zip_f98119 0.2651349126
## zip_f98121 -0.1833625865
## zip_f98122 -0.2387955922
## zip_f98144 -0.3931313407
## beds -0.2917933892
## baths 0.1891284207
## log_sqft 0.9279420637
## year.built -0.0251512915
## parking.spots 0.1058497304
## parking.typeGarage 0.0009431958
min(lasso_cv$cvm)
## [1] 0.3544406
lasso_fit = glmnet(x_train,y_train,alpha = 1,lambda = lasso_cv$lambda.min)
x_test = model.matrix(~-1+home.type+zip_f+beds+baths+log_sqft+year.built+parking.spots+parking.type,testing)
y_pred = predict(lasso_fit,x_test)
mse_lasso = sum((y_pred-y_test)^2)/length(y_test)
plot(y_test,y_pred,xlab="True normalized price",ylab="Predicted normalized price",
main="Prediction using Lasso regression")
text(-1,3,substitute(r^2 == r2,list(r2=cor(y_test,y_pred))),adj=0)
text(-1,2.7,substitute(MSE == r2,list(r2=mse_lasso)),adj=0)
abline(0,1)
There is apparent non-linear relationship between the predicted and true values. From the exploratory analysis there seems to be a non-linear relationship between the price and log square feet, try adding squared term for log_sqft.
x_train = model.matrix(~-1+home.type+zip_f+beds+baths+log_sqft+I(log_sqft^2)+year.built+parking.spots+parking.type,training)
x_test = model.matrix(~-1+home.type+zip_f+beds+baths+log_sqft+I(log_sqft^2)+year.built+parking.spots+parking.type,testing)
set.seed(999)
lasso_cv = cv.glmnet(x_train,y_train,nfolds = 10,alpha=1,lambda = lambda_grids)
plot(lasso_cv)
coef(lasso_cv,s='lambda.min',exact=TRUE)
## 19 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.05864547
## home.typeCondo/Coop 0.19941724
## home.typeSingle Family Residential -0.20579566
## home.typeTownhouse .
## zip_f98102 -0.25283819
## zip_f98104 -0.16385243
## zip_f98109 .
## zip_f98112 -0.20530237
## zip_f98119 0.02084547
## zip_f98121 -0.20497168
## zip_f98122 -0.32454323
## zip_f98144 -0.44575848
## beds -0.21567482
## baths 0.01404911
## log_sqft 0.98418457
## I(log_sqft^2) 0.28439123
## year.built 0.06267666
## parking.spots 0.08060995
## parking.typeGarage 0.01101055
min(lasso_cv$cvm)
## [1] 0.2807008
lasso_fit = glmnet(x_train,y_train,alpha = 1,lambda = lasso_cv$lambda.min)
y_pred = predict(lasso_fit,x_test)
mse_lasso = sum((y_pred-y_test)^2)/length(y_test)
plot(y_test,y_pred,xlab="True normalized price",ylab="Predicted normalized price",
main="Figure 1. Prediction using Lasso regression")
mtext("add squared log_sqft", 3, line=0.5,cex=1.2)
text(-1,4,substitute(r^2 == r2,list(r2=cor(y_test,y_pred))),adj=0)
text(-1,3.7,substitute(MSE == r2,list(r2=mse_lasso)),adj=0)
abline(0,1)
Use RBF kernel to account for the non-linear relationship.
#try SVM regression (SVR)
library(e1071)
x_train = model.matrix(~-1+home.type+zip_f+beds+baths+log_sqft+year.built+parking.spots+parking.type,training)
x_test = model.matrix(~-1+home.type+zip_f+beds+baths+log_sqft+year.built+parking.spots+parking.type,testing)
set.seed(666)
svr_cv = tune(svm,x_train,y_train,kernel='radial',gamma=1,scale=F,type = 'eps-regression',cross=10,
ranges = list(epsilon=c(0.05,0.1,0.2),cost=c(.5,1,2,5,10)))
svr_cv$best.parameters
## epsilon cost
## 11 0.1 5
set.seed(555) #fine tuning
svr_cv = tune(svm,x_train,y_train,kernel='radial',gamma=1,scale=F,type = 'eps-regression',cross=10,
ranges = list(epsilon=c(0.08,0.1,0.12),cost=c(3,5,8)))
svr_cv$best.parameters
## epsilon cost
## 4 0.08 5
y_pred_svr = predict(svr_cv$best.model,x_test)
mse_svr = sum((y_pred_svr-y_test)^2)/length(y_test)
plot(y_test,y_pred_svr,xlab="True normalized price",ylab="Predicted normalized price",
main="Figure 2. Prediction using Support Vector regression")
text(-1,2.8,substitute(r^2 == r2,list(r2=cor(y_test,y_pred_svr))),adj=0)
text(-1,2.6,substitute(MSE == r2,list(r2=mse_svr)),adj=0)
abline(0,1)