Zillow is a web-based, leading real estate information service in the United States. I collect data from Zillow to analyze how to determine house prices using hedonic pricing model.
I choose 1+ bedrooms, any bathrooms, and only single-family houses in Duluth, GA. This is the url of my search below:
The results are spread on 6 pages, including 237 listings. However, listings are changed every second with new listings or removal. Also, a different location has a diffrent number of results and pages. Therefore, I create the general code that can apply to different locations and pages.
Four functions are used to collect data.
#Define functions to get price, details, address, types of transaction
get_price <- function(html){
html %>%
html_nodes('.list-card-price') %>%
html_text() %>%
str_trim()
}
get_details <- function(html){
html %>%
html_nodes('.list-card-details') %>%
html_text() %>%
str_trim()
}
get_address <- function(html){
html %>%
html_nodes('.list-card-addr') %>%
html_text() %>%
str_trim()
}
get_type<- function(html){
html %>%
html_nodes('.list-card-type') %>%
html_text() %>%
str_trim()
}
num_list <- 40 # Each page includes 40 listings.
first_pg <- read_html("https://www.zillow.com/duluth-ga/houses/1-_beds/?searchQueryState={%22pagination%22:{},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22con%22:{%22value%22:false},%22apa%22:{%22value%22:false},%22mf%22:{%22value%22:false},%22land%22:{%22value%22:false},%22tow%22:{%22value%22:false},%22manu%22:{%22value%22:false}},%22isListVisible%22:true}")
get_total_list <- function(html){
html %>%
html_nodes('.result-count') %>%
html_text() %>%
str_remove(' results') %>%
str_trim()
}
total_list <- get_total_list(first_pg)
total_list <-as.numeric(str_replace_all(total_list,"[^0-9]*",""))
no_page <- ceiling(total_list / num_list) # Find the number of pages
price<-c()
details<-c()
address<-c()
type<-c()
#Scrape the search results from the first page to the last page
for (i in 1:no_page){
if (i==1){
zillow_pages <- "https://www.zillow.com/duluth-ga/houses/1-_beds/?searchQueryState={%22pagination%22:{},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22con%22:{%22value%22:false},%22apa%22:{%22value%22:false},%22mf%22:{%22value%22:false},%22land%22:{%22value%22:false},%22tow%22:{%22value%22:false},%22manu%22:{%22value%22:false}},%22isListVisible%22:true}"
}
else{
zillow_pages <- paste("https://www.zillow.com/duluth-ga/houses/1-_beds/",i,"_p/?searchQueryState={%22pagination%22:{%22currentPage%22:",i,"},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22con%22:{%22value%22:false},%22apa%22:{%22value%22:false},%22mf%22:{%22value%22:false},%22land%22:{%22value%22:false},%22tow%22:{%22value%22:false},%22manu%22:{%22value%22:false}},%22isListVisible%22:true}",sep = "")
}
zillow_data<- read_html(zillow_pages)
price <- c(price, get_price(zillow_data))
details<- c(details,get_details(zillow_data))
address <- c(address,get_address(zillow_data))
type <- c(type, get_type(zillow_data))
}
#Check the length of the results
length(price); length(details); length(address); length(type)
## [1] 228
## [1] 228
## [1] 228
## [1] 228
# Obtain the raw dataset using the fuctions aboves
zillow_df <- data.frame(matrix(nrow=total_list, ncol=0))
zillow_df <- zillow_df %>%
mutate(bedrooms = as.integer(str_trim(str_extract(details, "[\\d ]*(?=bd)")))) %>%
mutate(bathrooms = as.integer(str_trim(str_extract(details, "[\\d ]*(?=ba)")))) %>%
mutate(sqft = str_trim(str_extract(details, "[\\d ,]*(?=sqft)"))) %>%
mutate(sqft = as.numeric(str_replace(sqft,",",""))) %>%
mutate(price = as.numeric(str_replace_all(price,"[^0-9]*","")))
zillow_df$ type <- type
head(zillow_df)
## bedrooms bathrooms sqft price type
## 1 5 4 2733 355000 House for sale
## 2 4 4 3489 414900 House for sale
## 3 4 3 2686 278000 House for sale
## 4 5 5 5536 899900 Coming soon
## 5 3 3 2886 360000 House for sale
## 6 5 4 2700 365000 House for sale
#Create the new dataset w/o missing values
zillow_df <- na.omit(zillow_df)
nrow(zillow_df) # My final dataset has 228 observations.
## [1] 218
Below boxplot detects the outliers in the distribution of house prices. There are outliers above the third quartiles of the distribution. I do not remove those outliers for this analysis.
boxplot(zillow_df$price,main="Boxplot of House Prices",
ylab = "House Price Listed ($)", yaxt = "n",
col = "yellow", medcol = "red", boxlty = 0, axes=FALSE,
whisklty = 1, staplelwd = 4, outpch = 8, outcex = 1)
axis(2, at = seq(0, max(zillow_df$price), 50000), las = 2, cex.axis=0.5)
This Scatter plot shows the Positive relationship between price and square footage. We also find that there are some outliers from the scatterplot.
g1<-ggplot(zillow_df, aes(y=price, x=sqft, color=as.factor(bedrooms))) +
geom_point() +
labs(y="Price", x="Square Footage",
title="Scatter plot between price and square footage")
g1 + scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
Boxplot between price and type of transasction shows that the majority of types is House for sale
g2<-ggplot(zillow_df, aes(type, price))+
geom_boxplot(outlier.color="red") +
labs(x="Type", y="Price")
g2 + scale_y_continuous(labels = function(x) format(x, scientific = FALSE))
I create a dummy variable for foreclosure to examine the negative effect of foreclosure on house prices.
zillow_df$foreclosure <- as.character(zillow_df$type)
zillow_df$foreclosure[zillow_df$foreclosure == "House for sale"] <- 0
zillow_df$foreclosure[zillow_df$foreclosure == "For sale by owner"] <- 0
zillow_df$foreclosure[zillow_df$foreclosure == "New construction"] <- 0
zillow_df$foreclosure[zillow_df$foreclosure == "Pre-foreclosure / Auction"] <- 1
zillow_df$foreclosure <- as.numeric(zillow_df$foreclosure)
summary(zillow_df)
## bedrooms bathrooms sqft price
## Min. :2.000 Min. : 1.000 Min. : 980 Min. : 111499
## 1st Qu.:3.000 1st Qu.: 3.000 1st Qu.: 2168 1st Qu.: 281400
## Median :4.000 Median : 4.000 Median : 2915 Median : 387426
## Mean :4.289 Mean : 4.216 Mean : 3770 Mean : 577322
## 3rd Qu.:5.000 3rd Qu.: 5.000 3rd Qu.: 4436 3rd Qu.: 554045
## Max. :7.000 Max. :13.000 Max. :13756 Max. :4975000
##
## type foreclosure
## Length:218 Min. :0.0000
## Class :character 1st Qu.:0.0000
## Mode :character Median :0.0000
## Mean :0.1944
## 3rd Qu.:0.0000
## Max. :1.0000
## NA's :2
Hedonic pricing model: linear-linear model
model <- lm(price ~ sqft+bedrooms+bathrooms+foreclosure, zillow_df)
summary(model)
##
## Call:
## lm(formula = price ~ sqft + bedrooms + bathrooms + foreclosure,
## data = zillow_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1030037 -98433 -5269 85641 2214473
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 123460.57 94551.88 1.306 0.193061
## sqft 195.26 19.33 10.104 < 2e-16 ***
## bedrooms -153110.54 28787.56 -5.319 2.66e-07 ***
## bathrooms 90033.60 25958.12 3.468 0.000635 ***
## foreclosure -28065.98 51450.63 -0.545 0.585990
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 288700 on 211 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.7747, Adjusted R-squared: 0.7704
## F-statistic: 181.4 on 4 and 211 DF, p-value: < 2.2e-16
model$coefficients
## (Intercept) sqft bedrooms bathrooms foreclosure
## 123460.5718 195.2618 -153110.5385 90033.6045 -28065.9841
R-squared is 0.794 (79.4%). It means that the 79.4% of the variation in house prices is explained by the independent variables in the model.
“sqft”, “bedrooms”, and “bathrooms” are statistically significant at 1% level. However, “bedrooms” has a negative sign which I did not expect to have because I control for square footage. More rooms dividing up the same sapce is not necessarily more valuable here. “foreclosure” has a negative sign which I expected to have, but it is not statistically significant.
On average, an additional bedroom in a house decreases the price of house by $148,733.
On average, an additional bathroom in a house increases the price of house by $84,472.
On average, one unit increase in sqft increases the price of house by $199.
Hedonic pricing model - log-linear model Hedonic pricing model often uses log-linear model because of convienent interpretation.
logmodel <- lm(log(price)~ sqft+bedrooms+bathrooms+foreclosure, zillow_df)
summary(logmodel)
##
## Call:
## lm(formula = log(price) ~ sqft + bedrooms + bathrooms + foreclosure,
## data = zillow_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.93476 -0.11633 -0.01547 0.11389 0.83169
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.198e+01 6.740e-02 177.820 < 2e-16 ***
## sqft 1.515e-04 1.378e-05 10.999 < 2e-16 ***
## bedrooms -1.289e-02 2.052e-02 -0.628 0.53
## bathrooms 1.250e-01 1.850e-02 6.754 1.37e-10 ***
## foreclosure -1.478e-01 3.667e-02 -4.031 7.74e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2058 on 211 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.9009, Adjusted R-squared: 0.899
## F-statistic: 479.6 on 4 and 211 DF, p-value: < 2.2e-16
logmodel$coefficients
## (Intercept) sqft bedrooms bathrooms foreclosure
## 11.984413334 0.000151521 -0.012893505 0.124973804 -0.147848652
R-squared for log-linear model is 91.1% which is much better than linear model. It is because “price” variable becomes normal with transformation.
library(glmnet)
zillow_df <- na.omit(zillow_df)
#Find the lambda value
x_vars <- as.matrix(zillow_df[-c(4,5)])
y_var <- zillow_df$price
lambda_seq <- 10^seq(2, -2, by = -.1)
#Ridge regression - coefficient
ridge_reg <- cv.glmnet(x_vars, y_var, alpha = 0, lambda = lambda_seq)
optimal_lambda <- ridge_reg$lambda.min
optimal_lambda
## [1] 0.01
ridge_best <- glmnet(x_vars, y_var, alpha = 0, lambda = optimal_lambda)
coef(ridge_best)
## 5 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 123548.8246
## bedrooms -153285.3820
## bathrooms 90298.7733
## sqft 195.1415
## foreclosure -28084.1402
#Ridge regression - r-squared
ridge_pred <- predict(ridge_best, s = optimal_lambda, newx = x_vars)
ridge_final <- cbind(y_var, ridge_pred)
colnames(ridge_final) <- c("actual", "preds")
ridge_final <-data.frame(ridge_final)
ridge_rss <- sum((ridge_final$preds - ridge_final$actual) ^ 2)
ridge_tss <- sum((ridge_final$actual - mean(ridge_final$actual)) ^ 2)
ridge_rsq <- 1 - ridge_rss/ridge_tss
ridge_rsq
## [1] 0.7746695
lasso_reg <- cv.glmnet(x_vars, y_var, alpha = 1, lambda = lambda_seq)
optimal_lambda <- ridge_reg$lambda.min
optimal_lambda
## [1] 0.01
#Lasso regression - coefficient
lasso_best <- glmnet(x_vars, y_var, alpha = 1, lambda = optimal_lambda)
coef(lasso_best)
## 5 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) 123548.7726
## bedrooms -153285.3539
## bathrooms 90298.7501
## sqft 195.1415
## foreclosure -28084.1105
#Lasso regression - r-squared
lasso_pred <- predict(lasso_best, s = optimal_lambda, newx = x_vars)
lasso_final <- cbind(y_var, lasso_pred)
colnames(lasso_final) <- c("actual", "preds")
lasso_final <-data.frame(lasso_final)
lasso_rss <- sum((lasso_final$preds - lasso_final$actual) ^ 2)
lasso_tss <- sum((lasso_final$actual - mean(lasso_final$actual)) ^ 2)
lasso_rsq <- 1 - lasso_rss/lasso_tss
lasso_rsq
## [1] 0.7746695
he<-data.frame(model$coefficients)
h_intercept<-(he)[1,]
h_bedrooms <-(he)[3,]
h_bathrooms <-(he)[4,]
h_sqft <-(he)[2,]
h_foreclosure <-(he)[5,]
r_intercept<-coef(ridge_best)[1,]
r_bedrooms <-coef(ridge_best)[2,]
r_bathrooms <-coef(ridge_best)[3,]
r_sqft <-coef(ridge_best)[4,]
r_foreclosure <-coef(ridge_best)[5,]
l_intercept<-coef(lasso_best)[1,]
l_bedrooms <-coef(lasso_best)[2,]
l_bathrooms <-coef(lasso_best)[3,]
l_sqft <-coef(lasso_best)[4,]
l_foreclosure <-coef(lasso_best)[5,]
hedonic <- c(h_intercept,h_sqft,h_bedrooms,h_bathrooms,h_foreclosure)
ridge <- c(r_intercept,r_sqft,r_bedrooms,r_bathrooms,r_foreclosure)
lasso <- c(l_intercept,l_sqft,l_bedrooms,l_bathrooms,l_foreclosure)
coef <- rbind(hedonic, ridge, lasso)
colnames(coef) <- c("Intercept","sqft","Bedrooms","Bathroom","Foreclosure")
coef
## Intercept sqft Bedrooms Bathroom Foreclosure
## hedonic 123460.6 195.2618 -153110.5 90033.60 -28065.98
## ridge 123548.8 195.1415 -153285.4 90298.77 -28084.14
## lasso 123548.8 195.1415 -153285.4 90298.75 -28084.11
The coefficient from three models are almost same each other. R-squareds from three models are consistent.