Run a housing search from Zillow

Zillow is a web-based, leading real estate information service in the United States. I collect data from Zillow to analyze how to determine house prices using hedonic pricing model.

I choose 1+ bedrooms, any bathrooms, and only single-family houses in Duluth, GA. This is the url of my search below:

Zillow link

Scraping the results of my housing search

The results are spread on 6 pages, including 237 listings. However, listings are changed every second with new listings or removal. Also, a different location has a diffrent number of results and pages. Therefore, I create the general code that can apply to different locations and pages.

Four functions are used to collect data.

#Define functions to get price, details, address, types of transaction
get_price <- function(html){
  html %>%
    html_nodes('.list-card-price') %>%
    html_text() %>%
    str_trim()
}

get_details <- function(html){
  html %>%
    html_nodes('.list-card-details') %>%
    html_text() %>%
    str_trim()
}

get_address <- function(html){
  html %>%
    html_nodes('.list-card-addr') %>%
    html_text() %>%
    str_trim()
}

get_type<- function(html){
  html %>%
    html_nodes('.list-card-type') %>%
    html_text() %>%
    str_trim()
}

num_list <- 40 # Each page includes 40 listings. 
first_pg <- read_html("https://www.zillow.com/duluth-ga/houses/1-_beds/?searchQueryState={%22pagination%22:{},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22con%22:{%22value%22:false},%22apa%22:{%22value%22:false},%22mf%22:{%22value%22:false},%22land%22:{%22value%22:false},%22tow%22:{%22value%22:false},%22manu%22:{%22value%22:false}},%22isListVisible%22:true}")

get_total_list <- function(html){
  html %>%
    html_nodes('.result-count') %>%
    html_text() %>%
    str_remove(' results') %>%
    str_trim()
}

total_list <- get_total_list(first_pg)
total_list <-as.numeric(str_replace_all(total_list,"[^0-9]*",""))
no_page <- ceiling(total_list / num_list) # Find the number of pages

price<-c()
details<-c()
address<-c()
type<-c()

#Scrape the search results from the first page to the last page 
for (i in 1:no_page){
  if (i==1){
    zillow_pages <- "https://www.zillow.com/duluth-ga/houses/1-_beds/?searchQueryState={%22pagination%22:{},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22con%22:{%22value%22:false},%22apa%22:{%22value%22:false},%22mf%22:{%22value%22:false},%22land%22:{%22value%22:false},%22tow%22:{%22value%22:false},%22manu%22:{%22value%22:false}},%22isListVisible%22:true}"
  }
  else{
    zillow_pages <- paste("https://www.zillow.com/duluth-ga/houses/1-_beds/",i,"_p/?searchQueryState={%22pagination%22:{%22currentPage%22:",i,"},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22con%22:{%22value%22:false},%22apa%22:{%22value%22:false},%22mf%22:{%22value%22:false},%22land%22:{%22value%22:false},%22tow%22:{%22value%22:false},%22manu%22:{%22value%22:false}},%22isListVisible%22:true}",sep = "")
  }
  
  zillow_data<- read_html(zillow_pages)
  price <- c(price, get_price(zillow_data))
  details<- c(details,get_details(zillow_data))
  address <- c(address,get_address(zillow_data))
  type <- c(type, get_type(zillow_data))
}

#Check the length of the results
length(price); length(details); length(address); length(type)

## [1] 228

## [1] 228

## [1] 228

## [1] 228

# Obtain the raw dataset using the fuctions aboves
zillow_df <- data.frame(matrix(nrow=total_list, ncol=0))

Conduct some data cleaning

zillow_df <- zillow_df %>%
  mutate(bedrooms = as.integer(str_trim(str_extract(details, "[\\d ]*(?=bd)")))) %>%
  mutate(bathrooms = as.integer(str_trim(str_extract(details, "[\\d ]*(?=ba)")))) %>%
  mutate(sqft = str_trim(str_extract(details, "[\\d ,]*(?=sqft)"))) %>%
  mutate(sqft = as.numeric(str_replace(sqft,",",""))) %>%
  mutate(price = as.numeric(str_replace_all(price,"[^0-9]*",""))) 

zillow_df$ type <- type

head(zillow_df)

##   bedrooms bathrooms sqft  price           type
## 1        5         4 2733 355000 House for sale
## 2        4         4 3489 414900 House for sale
## 3        4         3 2686 278000 House for sale
## 4        5         5 5536 899900    Coming soon
## 5        3         3 2886 360000 House for sale
## 6        5         4 2700 365000 House for sale

#Create the new dataset w/o missing values
zillow_df <- na.omit(zillow_df)

nrow(zillow_df) # My final dataset has 228 observations.

## [1] 218

Visual analysis

Below boxplot detects the outliers in the distribution of house prices. There are outliers above the third quartiles of the distribution. I do not remove those outliers for this analysis.

boxplot(zillow_df$price,main="Boxplot of House Prices", 
        ylab = "House Price Listed ($)", yaxt = "n", 
        col = "yellow", medcol = "red", boxlty = 0, axes=FALSE,
        whisklty = 1,  staplelwd = 4, outpch = 8, outcex = 1)
axis(2, at = seq(0, max(zillow_df$price), 50000), las = 2, cex.axis=0.5)

This Scatter plot shows the Positive relationship between price and square footage. We also find that there are some outliers from the scatterplot.

g1<-ggplot(zillow_df, aes(y=price, x=sqft, color=as.factor(bedrooms))) +
  geom_point() +
  labs(y="Price", x="Square Footage",
  title="Scatter plot between price and square footage")

g1 + scale_y_continuous(labels = function(x) format(x, scientific = FALSE))

Boxplot between price and type of transasction shows that the majority of types is House for sale

g2<-ggplot(zillow_df, aes(type, price))+
  geom_boxplot(outlier.color="red") +
  labs(x="Type", y="Price")

g2 + scale_y_continuous(labels = function(x) format(x, scientific = FALSE))

I create a dummy variable for foreclosure to examine the negative effect of foreclosure on house prices.

zillow_df$foreclosure <- as.character(zillow_df$type)
zillow_df$foreclosure[zillow_df$foreclosure == "House for sale"] <- 0
zillow_df$foreclosure[zillow_df$foreclosure == "For sale by owner"] <- 0
zillow_df$foreclosure[zillow_df$foreclosure == "New construction"] <- 0
zillow_df$foreclosure[zillow_df$foreclosure == "Pre-foreclosure / Auction"] <- 1
zillow_df$foreclosure <- as.numeric(zillow_df$foreclosure)

summary(zillow_df)

##     bedrooms       bathrooms           sqft           price        
##  Min.   :2.000   Min.   : 1.000   Min.   :  980   Min.   : 111499  
##  1st Qu.:3.000   1st Qu.: 3.000   1st Qu.: 2168   1st Qu.: 281400  
##  Median :4.000   Median : 4.000   Median : 2915   Median : 387426  
##  Mean   :4.289   Mean   : 4.216   Mean   : 3770   Mean   : 577322  
##  3rd Qu.:5.000   3rd Qu.: 5.000   3rd Qu.: 4436   3rd Qu.: 554045  
##  Max.   :7.000   Max.   :13.000   Max.   :13756   Max.   :4975000  
##                                                                    
##      type            foreclosure    
##  Length:218         Min.   :0.0000  
##  Class :character   1st Qu.:0.0000  
##  Mode  :character   Median :0.0000  
##                     Mean   :0.1944  
##                     3rd Qu.:0.0000  
##                     Max.   :1.0000  
##                     NA's   :2

Linear Regression Analysis

Hedonic pricing model: linear-linear model

model <- lm(price ~ sqft+bedrooms+bathrooms+foreclosure, zillow_df)
summary(model)

## 
## Call:
## lm(formula = price ~ sqft + bedrooms + bathrooms + foreclosure, 
##     data = zillow_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1030037   -98433    -5269    85641  2214473 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  123460.57   94551.88   1.306 0.193061    
## sqft            195.26      19.33  10.104  < 2e-16 ***
## bedrooms    -153110.54   28787.56  -5.319 2.66e-07 ***
## bathrooms     90033.60   25958.12   3.468 0.000635 ***
## foreclosure  -28065.98   51450.63  -0.545 0.585990    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 288700 on 211 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.7747, Adjusted R-squared:  0.7704 
## F-statistic: 181.4 on 4 and 211 DF,  p-value: < 2.2e-16

model$coefficients

##  (Intercept)         sqft     bedrooms    bathrooms  foreclosure 
##  123460.5718     195.2618 -153110.5385   90033.6045  -28065.9841

Goodness of Fit

R-squared is 0.794 (79.4%). It means that the 79.4% of the variation in house prices is explained by the independent variables in the model.

Interpretation

“sqft”, “bedrooms”, and “bathrooms” are statistically significant at 1% level. However, “bedrooms” has a negative sign which I did not expect to have because I control for square footage. More rooms dividing up the same sapce is not necessarily more valuable here. “foreclosure” has a negative sign which I expected to have, but it is not statistically significant.

On average, an additional bedroom in a house decreases the price of house by $148,733.

On average, an additional bathroom in a house increases the price of house by $84,472.

On average, one unit increase in sqft increases the price of house by $199.

Hedonic pricing model - log-linear model Hedonic pricing model often uses log-linear model because of convienent interpretation.

logmodel <- lm(log(price)~ sqft+bedrooms+bathrooms+foreclosure, zillow_df)
summary(logmodel)

## 
## Call:
## lm(formula = log(price) ~ sqft + bedrooms + bathrooms + foreclosure, 
##     data = zillow_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.93476 -0.11633 -0.01547  0.11389  0.83169 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.198e+01  6.740e-02 177.820  < 2e-16 ***
## sqft         1.515e-04  1.378e-05  10.999  < 2e-16 ***
## bedrooms    -1.289e-02  2.052e-02  -0.628     0.53    
## bathrooms    1.250e-01  1.850e-02   6.754 1.37e-10 ***
## foreclosure -1.478e-01  3.667e-02  -4.031 7.74e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2058 on 211 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.9009, Adjusted R-squared:  0.899 
## F-statistic: 479.6 on 4 and 211 DF,  p-value: < 2.2e-16

logmodel$coefficients

##  (Intercept)         sqft     bedrooms    bathrooms  foreclosure 
## 11.984413334  0.000151521 -0.012893505  0.124973804 -0.147848652

Goodness of Fit

R-squared for log-linear model is 91.1% which is much better than linear model. It is because “price” variable becomes normal with transformation.

Ridge Regression Analysis

library(glmnet)
zillow_df <- na.omit(zillow_df)

#Find the lambda value
x_vars <- as.matrix(zillow_df[-c(4,5)])
y_var <- zillow_df$price
lambda_seq <- 10^seq(2, -2, by = -.1)

#Ridge regression - coefficient
ridge_reg <- cv.glmnet(x_vars, y_var, alpha = 0, lambda = lambda_seq)

optimal_lambda <- ridge_reg$lambda.min
optimal_lambda

## [1] 0.01

ridge_best <- glmnet(x_vars, y_var, alpha = 0, lambda = optimal_lambda)
coef(ridge_best)

## 5 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## (Intercept)  123548.8246
## bedrooms    -153285.3820
## bathrooms     90298.7733
## sqft            195.1415
## foreclosure  -28084.1402

#Ridge regression - r-squared
ridge_pred <- predict(ridge_best, s = optimal_lambda, newx = x_vars)
ridge_final <- cbind(y_var, ridge_pred)
colnames(ridge_final) <- c("actual", "preds")

ridge_final <-data.frame(ridge_final)

ridge_rss <- sum((ridge_final$preds - ridge_final$actual) ^ 2)
ridge_tss <- sum((ridge_final$actual - mean(ridge_final$actual)) ^ 2)
ridge_rsq <- 1 - ridge_rss/ridge_tss
ridge_rsq

## [1] 0.7746695

Lasso Regression Analysis

lasso_reg <- cv.glmnet(x_vars, y_var, alpha = 1, lambda = lambda_seq)

optimal_lambda <- ridge_reg$lambda.min
optimal_lambda

## [1] 0.01

#Lasso regression - coefficient
lasso_best <- glmnet(x_vars, y_var, alpha = 1, lambda = optimal_lambda)
coef(lasso_best)

## 5 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## (Intercept)  123548.7726
## bedrooms    -153285.3539
## bathrooms     90298.7501
## sqft            195.1415
## foreclosure  -28084.1105

#Lasso regression - r-squared
lasso_pred <- predict(lasso_best, s = optimal_lambda, newx = x_vars)
lasso_final <- cbind(y_var, lasso_pred)
colnames(lasso_final) <- c("actual", "preds")

lasso_final <-data.frame(lasso_final)

lasso_rss <- sum((lasso_final$preds - lasso_final$actual) ^ 2)
lasso_tss <- sum((lasso_final$actual - mean(lasso_final$actual)) ^ 2)
lasso_rsq <- 1 - lasso_rss/lasso_tss
lasso_rsq

## [1] 0.7746695

he<-data.frame(model$coefficients)

h_intercept<-(he)[1,]
h_bedrooms <-(he)[3,]
h_bathrooms <-(he)[4,]
h_sqft <-(he)[2,]
h_foreclosure <-(he)[5,]

r_intercept<-coef(ridge_best)[1,]
r_bedrooms <-coef(ridge_best)[2,]
r_bathrooms <-coef(ridge_best)[3,]
r_sqft <-coef(ridge_best)[4,]
r_foreclosure <-coef(ridge_best)[5,]

l_intercept<-coef(lasso_best)[1,]
l_bedrooms <-coef(lasso_best)[2,]
l_bathrooms <-coef(lasso_best)[3,]
l_sqft <-coef(lasso_best)[4,]
l_foreclosure <-coef(lasso_best)[5,]

hedonic <- c(h_intercept,h_sqft,h_bedrooms,h_bathrooms,h_foreclosure)
ridge <- c(r_intercept,r_sqft,r_bedrooms,r_bathrooms,r_foreclosure)
lasso <- c(l_intercept,l_sqft,l_bedrooms,l_bathrooms,l_foreclosure)

coef <- rbind(hedonic, ridge, lasso)
colnames(coef) <- c("Intercept","sqft","Bedrooms","Bathroom","Foreclosure")
coef

##         Intercept     sqft  Bedrooms Bathroom Foreclosure
## hedonic  123460.6 195.2618 -153110.5 90033.60   -28065.98
## ridge    123548.8 195.1415 -153285.4 90298.77   -28084.14
## lasso    123548.8 195.1415 -153285.4 90298.75   -28084.11

The coefficient from three models are almost same each other. R-squareds from three models are consistent.

Section 1

Yoo