House price prediction

The goal of this project is to use house specifics to predict its market price, using data downloaded from Redfin.com.

Data source
The price data were for properties in a selected area in Seattle, WA, that were sold on the market within the past 1 year. House types of interest are single house, townhouse and condo. The data can be downloaded from here, applying a proper filter on the house type and sales record period. The website apparently applies a upper limit for the number of records shown simultaneously, therefore I only downloaded a limited size of data for preliminary data analysis. I have been trying to use automated process such as RSelenium to download more data, although there were issues to be solved.
I’ve also explored the demographic data for given zip codes from using Zillow API, but it seemed that the statistics are not specific to each different zip code. It might be different if the area of interest is larger.
Prediction model
Outcome: sale price
Explanatory variables: home type, zip code, # bedrooms, # bathrooms, square feet, year built, parking spots, parking type
Final models can have various transformations of the given explanatory variables.
For now I’ve only tried Lasso regression and support vector regression. The metric used to evaluate model performance is mean squared error (MSE). There seemed to be non-linear relationship between the outcome and explanatory variables, so I used radial basis function (RBF) kernel in the SV regression. I used a 80%/20% training/testing data split, and 10-fold cross validation for building each model.
Further work

Expand the data set to include more samples, especially other cities of the Greater Seattle area.
Try other prediction models such as tree based models.
Explore other information from using Zillow API that may be useful for prediction.
After finding the best prediction model, determine and interpret the relationship between each variable and the property price.
Expand the time period to years ago, and explore the trend in house price and the impact of each explanatory variable on the trend. This may require use of time series models.

Load data

suppressPackageStartupMessages(library(dplyr))

#== Data loading ==#

house <- read.csv('house.csv')
condo <- read.csv('condo.csv')

table(house$HOME.TYPE)

## 
##                Condo/Coop   Multi-Family (2-4 Unit) 
##                         4                         4 
## Single Family Residential                 Townhouse 
##                       634                       272

table(condo$HOME.TYPE)

## 
## Condo/Coop 
##        636

#select only house and townhouse
house = house[house$HOME.TYPE %in% c('Single Family Residential','Townhouse'),]

#combine the two datasets
property <- tbl_df(rbind(house,condo)) 
names(property) <- tolower(names(property))

table(property$city)

## 
## Seattle SEATTLE         
##    1540       1       1

table(property$state)

## 
##   WA 
## 1542

All in Seattle, WA, therefore this two variables are not informative.

#select variables of interest
property_sub <- property %>%
    mutate(zip_f = as.factor(zip)) %>%
    select(home.type,zip_f,beds,baths,location,sqft,lot.size,year.built,parking.spots,parking.type,latitude,longitude,last.sale.price)
#make a factorized zipcode

property_sub <- property_sub[complete.cases(select(property_sub,-lot.size)),]    #lot.size is meaningful only for houses/townhouses

#drop unused levels
property_sub <- droplevels(property_sub)

Clean data

#== Cleaning ==#
table(property_sub$home.type)

## 
##                Condo/Coop Single Family Residential 
##                       552                       530 
##                 Townhouse 
##                       263

table(property_sub$zip_f)

## 
##  9112 98101 98102 98104 98109 98112 98119 98121 98122 98133 98136 98144 
##     1   162   164    34    51   247    45    40   484     1     1   115

table(property_sub$location)

## 
##          Arboretum          Broadmoor       Capitol Hill 
##                 16                 14                451 
##       Central Area       Denny Blaine           Downtown 
##                152                 16                105 
##         First Hill            Judkins             Leschi 
##                104                 34                 56 
##       Madison Park     Madison Valley            Madrona 
##                 19                 83                 71 
##           Montlake           Mt Baker      N Beacon Hill 
##                  3                  1                  1 
## North Capitol Hill         Queen Anne            Seattle 
##                 27                 90                 36 
##    Washington Park           Belltown           Broadway 
##                 22                  1                  1 
##     Denny Triangle   South Lake Union 
##                 33                  9

#remove records with <5 obs of zip/location for more stable inference
in_zip <- property_sub %>% count(zip_f) %>% filter(n>=5) %>% select(zip_f) %>% unlist
in_loc <- property_sub %>% count(location) %>% filter(n>=5) %>% select(location) %>% unlist
property_sub <- filter(property_sub,zip_f %in% in_zip,location %in% in_loc)

#check zip code
library(zipcode)
data(zipcode)

ref_zip = subset(zipcode, city=='Seattle'&state=='WA')
ref_zip = ref_zip[,c("zip","latitude","longitude")]
names(ref_zip) <- c('zip_f','ref_lati','ref_long')

zip_chk = merge(property_sub,ref_zip,by='zip_f')
zip_chk %>% filter(abs(latitude-ref_lati)>0.05 | abs(longitude-ref_long)>0.05)    #seems to be all okay

##  [1] zip_f           home.type       beds            baths          
##  [5] location        sqft            lot.size        year.built     
##  [9] parking.spots   parking.type    latitude        longitude      
## [13] last.sale.price ref_lati        ref_long       
## <0 rows> (or 0-length row.names)

#beds and baths
table(property_sub$beds)

## 
##   0   1   2   3   4   5   6   7   8 
##  53 331 349 350 178  64   6   3   1

table(property_sub$baths)

## 
## 0.75    1 1.25  1.5 1.75    2 2.25  2.5 2.75    3 3.25  3.5 3.75    4 4.25 
##   15  413   13  114  146  154   96  141   35   65   49   56   12    8    5 
##  4.5 4.75    5 5.75 
##    5    2    4    2

property_sub <- filter(property_sub,beds<=7,baths<=5)    #remove records with >7 bedrooms or >5 bathrooms 


#square feet
hist(property_sub$sqft)

range(property_sub$sqft,na.rm = T)

## [1]  316 8920

hist(log(property_sub$sqft))    #probably better to use log transformed sqft to decrease leverage of extreme values

#lot.size
hist(property_sub$lot.size)

range(property_sub$lot.size,na.rm = T)

## [1]      1 125361

property_sub$lot.size[property_sub$lot.size==1] <- NA    #set obs with lot.size==1 to be NA
hist(log(property_sub$lot.size))    #probably better to use log transformed lot.size

#year built
hist(property_sub$year.built)

#parking.spots
table(property_sub$parking.spots)

## 
##   0   1   2   3   4   5   6   8 
##  86 985 246   7   5   1   1   1

property_sub <- filter(property_sub,parking.spots<=4)    #remove records with >4 parking spots


#parking type
table(property_sub$parking.type)

## 
##        Garage 
##    468    861

levels(property_sub$parking.type)[1] <- 'other'    #non-garage parking


#create tranformed variables
property_sub <- property_sub %>%
    mutate(log_sqft = log(sqft),log_lotsize = log(lot.size)) %>%
    select(-longitude,-latitude)

#drop unused levels
property_sub <- droplevels(property_sub)

There might be useful demographic information from using Zillow API

#== Download demographic data from Zillow for each zipcode ==#
library(RCurl)

## Loading required package: bitops

library(XML)

zipcodes = levels(property_sub$zip_f)

#zid = "***********"    #my zillow ID

demo = data.frame()

for(z in zipcodes){
    reply = getForm("http://www.zillow.com/webservice/GetDemographics.htm",
                    'zws-id' = zid,
                    zip = z)
    node = xmlParse(reply)    
    root = xmlRoot(node)
    
    subnode = getNodeSet(root,'//page//table//data')
    demo_tmp = xmlToDataFrame(subnode[[length(subnode)]],stringsAsFactors = F)
    demo_tmp_df = as.data.frame(cbind(z,t(demo_tmp[2])))
    names(demo_tmp_df) = c("zip",demo_tmp[[1]])
    
    demo = rbind(demo,demo_tmp_df)
}

demo

##           zip Median Household Income      Single Males    Single Females
## values  98101        44512.0130806292 0.146462187349365 0.124578258618535
## values1 98102        44512.0130806292 0.146462187349365 0.124578258618535
## values2 98104        44512.0130806292 0.146462187349365 0.124578258618535
## values3 98109        44512.0130806292 0.146462187349365 0.124578258618535
## values4 98112        44512.0130806292 0.146462187349365 0.124578258618535
## values5 98119        44512.0130806292 0.146462187349365 0.124578258618535
## values6 98121        44512.0130806292 0.146462187349365 0.124578258618535
## values7 98122        44512.0130806292 0.146462187349365 0.124578258618535
## values8 98144        44512.0130806292 0.146462187349365 0.124578258618535
##         Median Age   Homes With Kids Average Household Size
## values          36 0.313623902816284       2.58883240001203
## values1         36 0.313623902816284       2.58883240001203
## values2         36 0.313623902816284       2.58883240001203
## values3         36 0.313623902816284       2.58883240001203
## values4         36 0.313623902816284       2.58883240001203
## values5         36 0.313623902816284       2.58883240001203
## values6         36 0.313623902816284       2.58883240001203
## values7         36 0.313623902816284       2.58883240001203
## values8         36 0.313623902816284       2.58883240001203
##         Average Commute Time (Minutes)
## values              26.375545725891282
## values1             26.375545725891282
## values2             26.375545725891282
## values3             26.375545725891282
## values4             26.375545725891282
## values5             26.375545725891282
## values6             26.375545725891282
## values7             26.375545725891282
## values8             26.375545725891282

#it turned out that the demographic statistics from Zillow are the same across different zipcodes
#therefore this information is not useful for my analysis

It turned out that the demographic statistics from Zillow are the same across different zipcodes, therefore this information is not useful for my analysis.

Exploratory analysis

#== Exploratory analysis ==#

pairs(property_sub)

Build prediction models

Split the data set into training set and testing set with a proportion of 80% vs 20%. Use 10-fold cross-validation to find the best model.

#== Build model ==#

set.seed(123)
n = dim(property_sub)[1]
inTrain = sample(1:n,size = round(0.8*n))

training = property_sub[inTrain,]
testing = property_sub[-inTrain,]

#check home type distribution
home_type_frac = table(property_sub$home.type)
home_type_frac/sum(home_type_frac)

## 
##                Condo/Coop Single Family Residential 
##                 0.4123401                 0.3905192 
##                 Townhouse 
##                 0.1971407

home_type_frac = table(training$home.type)
home_type_frac/sum(home_type_frac)    #approximately the same proportions

## 
##                Condo/Coop Single Family Residential 
##                 0.4158043                 0.3960489 
##                 Townhouse 
##                 0.1881468

#Normalize continuous variables. Note testing data use the same center and scale as training data.

train_scale = scale(training[,c("beds","baths","sqft","lot.size","year.built","parking.spots","log_sqft","log_lotsize")])
training[,c("beds","baths","sqft","lot.size","year.built","parking.spots","log_sqft","log_lotsize")] <- train_scale
y_train = scale(training$last.sale.price)

testing[,c("beds","baths","sqft","lot.size","year.built","parking.spots","log_sqft","log_lotsize")] = 
    scale(testing[,c("beds","baths","sqft","lot.size","year.built","parking.spots","log_sqft","log_lotsize")],
               center = attr(train_scale,'scaled:center'),
               scale = attr(train_scale,'scaled:scale'))
y_test = scale(testing$last.sale.price,
               center = attr(y_train,'scaled:center'),
               scale = attr(y_train,'scaled:scale'))

Lasso regression

#try lasso
library(glmnet)

## Loading required package: Matrix
## Loading required package: foreach
## Loaded glmnet 2.0-2

x_train = model.matrix(~-1+home.type+zip_f+beds+baths+log_sqft+year.built+parking.spots+parking.type,training)
lambda_grids = 10^seq(-5,1,length.out = 100)

set.seed(999)
lasso_cv = cv.glmnet(x_train,y_train,nfolds = 10,alpha=1,lambda = lambda_grids)

plot(lasso_cv)

coef(lasso_cv,s='lambda.min',exact=TRUE)

## 18 x 1 sparse Matrix of class "dgCMatrix"
##                                                1
## (Intercept)                        -0.0875822645
## home.typeCondo/Coop                 0.5109954733
## home.typeSingle Family Residential  .           
## home.typeTownhouse                  .           
## zip_f98102                         -0.0855771713
## zip_f98104                         -0.1057240934
## zip_f98109                          0.1185561214
## zip_f98112                         -0.0036294879
## zip_f98119                          0.2651349126
## zip_f98121                         -0.1833625865
## zip_f98122                         -0.2387955922
## zip_f98144                         -0.3931313407
## beds                               -0.2917933892
## baths                               0.1891284207
## log_sqft                            0.9279420637
## year.built                         -0.0251512915
## parking.spots                       0.1058497304
## parking.typeGarage                  0.0009431958

min(lasso_cv$cvm)

## [1] 0.3544406

lasso_fit = glmnet(x_train,y_train,alpha = 1,lambda = lasso_cv$lambda.min)


x_test = model.matrix(~-1+home.type+zip_f+beds+baths+log_sqft+year.built+parking.spots+parking.type,testing)
y_pred = predict(lasso_fit,x_test)

mse_lasso = sum((y_pred-y_test)^2)/length(y_test)

plot(y_test,y_pred,xlab="True normalized price",ylab="Predicted normalized price",
     main="Prediction using Lasso regression")
text(-1,3,substitute(r^2 == r2,list(r2=cor(y_test,y_pred))),adj=0)
text(-1,2.7,substitute(MSE == r2,list(r2=mse_lasso)),adj=0)
abline(0,1)

There is apparent non-linear relationship between the predicted and true values. From the exploratory analysis there seems to be a non-linear relationship between the price and log square feet, try adding squared term for log_sqft.

x_train = model.matrix(~-1+home.type+zip_f+beds+baths+log_sqft+I(log_sqft^2)+year.built+parking.spots+parking.type,training)
x_test = model.matrix(~-1+home.type+zip_f+beds+baths+log_sqft+I(log_sqft^2)+year.built+parking.spots+parking.type,testing)

set.seed(999)
lasso_cv = cv.glmnet(x_train,y_train,nfolds = 10,alpha=1,lambda = lambda_grids)

plot(lasso_cv)

coef(lasso_cv,s='lambda.min',exact=TRUE)

## 19 x 1 sparse Matrix of class "dgCMatrix"
##                                              1
## (Intercept)                        -0.05864547
## home.typeCondo/Coop                 0.19941724
## home.typeSingle Family Residential -0.20579566
## home.typeTownhouse                  .         
## zip_f98102                         -0.25283819
## zip_f98104                         -0.16385243
## zip_f98109                          .         
## zip_f98112                         -0.20530237
## zip_f98119                          0.02084547
## zip_f98121                         -0.20497168
## zip_f98122                         -0.32454323
## zip_f98144                         -0.44575848
## beds                               -0.21567482
## baths                               0.01404911
## log_sqft                            0.98418457
## I(log_sqft^2)                       0.28439123
## year.built                          0.06267666
## parking.spots                       0.08060995
## parking.typeGarage                  0.01101055

min(lasso_cv$cvm)

## [1] 0.2807008

lasso_fit = glmnet(x_train,y_train,alpha = 1,lambda = lasso_cv$lambda.min)

y_pred = predict(lasso_fit,x_test)

mse_lasso = sum((y_pred-y_test)^2)/length(y_test)

plot(y_test,y_pred,xlab="True normalized price",ylab="Predicted normalized price",
     main="Figure 1. Prediction using Lasso regression")
mtext("add squared log_sqft", 3, line=0.5,cex=1.2)
text(-1,4,substitute(r^2 == r2,list(r2=cor(y_test,y_pred))),adj=0)
text(-1,3.7,substitute(MSE == r2,list(r2=mse_lasso)),adj=0)
abline(0,1)

Support Vector regression

Use RBF kernel to account for the non-linear relationship.

#try SVM regression (SVR)
library(e1071)
x_train = model.matrix(~-1+home.type+zip_f+beds+baths+log_sqft+year.built+parking.spots+parking.type,training)
x_test = model.matrix(~-1+home.type+zip_f+beds+baths+log_sqft+year.built+parking.spots+parking.type,testing)

set.seed(666)
svr_cv = tune(svm,x_train,y_train,kernel='radial',gamma=1,scale=F,type = 'eps-regression',cross=10,
              ranges = list(epsilon=c(0.05,0.1,0.2),cost=c(.5,1,2,5,10)))
svr_cv$best.parameters

##    epsilon cost
## 11     0.1    5

set.seed(555)    #fine tuning
svr_cv = tune(svm,x_train,y_train,kernel='radial',gamma=1,scale=F,type = 'eps-regression',cross=10,
              ranges = list(epsilon=c(0.08,0.1,0.12),cost=c(3,5,8)))
svr_cv$best.parameters

##   epsilon cost
## 4    0.08    5

y_pred_svr = predict(svr_cv$best.model,x_test)

mse_svr = sum((y_pred_svr-y_test)^2)/length(y_test)

plot(y_test,y_pred_svr,xlab="True normalized price",ylab="Predicted normalized price",
     main="Figure 2. Prediction using Support Vector regression")
text(-1,2.8,substitute(r^2 == r2,list(r2=cor(y_test,y_pred_svr))),adj=0)
text(-1,2.6,substitute(MSE == r2,list(r2=mse_svr)),adj=0)
abline(0,1)