Run a housing search from Zillow

Zillow is a leading web-based real estate information service in the United States. We collect data from Zillow to analyze what determines house prices, using a hedonic pricing model.

I chose listings with 1+ bedrooms, any number of bathrooms, and only single-family houses in Duluth, GA. The URL of my search is below:

Zillow link

library(rvest)
library(stringr)
library(dplyr)
library(ggplot2)
library(tidyverse)

First page

# Download the first page of the Zillow search results.
zillow1 <- read_html("https://www.zillow.com/duluth-ga/houses/1-_beds/?searchQueryState={%22pagination%22:{},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22isManufactured%22:{%22value%22:false},%22isCondo%22:{%22value%22:false},%22isMultiFamily%22:{%22value%22:false},%22isApartment%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}")

# Pull each field out of the page with the get_* helpers
# (defined outside this excerpt).
price1 <- zillow1 %>% get_price()
detail1 <- zillow1 %>% get_detail()
address1 <- zillow1 %>% get_address()
type1 <- zillow1 %>% get_type()

# Print the length of every vector; they are combined column-wise later,
# so the four lengths should match.
length(price1)
length(detail1)
length(address1)
length(type1)
## [1] 40
## [1] 40
## [1] 40
## [1] 40

Second page

# Download the second page of the Zillow search results.
zillow2 <- read_html("https://www.zillow.com/duluth-ga/houses/1-_beds/2_p/?searchQueryState={%22pagination%22:{%22currentPage%22:2},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22isManufactured%22:{%22value%22:false},%22isCondo%22:{%22value%22:false},%22isMultiFamily%22:{%22value%22:false},%22isApartment%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}")

# Pull each field out of the page with the get_* helpers
# (defined outside this excerpt).
price2 <- zillow2 %>% get_price()
detail2 <- zillow2 %>% get_detail()
address2 <- zillow2 %>% get_address()
type2 <- zillow2 %>% get_type()

# Print the length of every vector; they are combined column-wise later,
# so the four lengths should match.
length(price2)
length(detail2)
length(address2)
length(type2)
## [1] 40
## [1] 40
## [1] 40
## [1] 40

Third page

# Download the third page of the Zillow search results.
zillow3 <- read_html("https://www.zillow.com/duluth-ga/houses/1-_beds/3_p/?searchQueryState={%22pagination%22:{%22currentPage%22:3},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22isManufactured%22:{%22value%22:false},%22isCondo%22:{%22value%22:false},%22isMultiFamily%22:{%22value%22:false},%22isApartment%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}")

# Pull each field out of the page with the get_* helpers
# (defined outside this excerpt).
price3 <- zillow3 %>% get_price()
detail3 <- zillow3 %>% get_detail()
address3 <- zillow3 %>% get_address()
type3 <- zillow3 %>% get_type()

# Print the length of every vector; they are combined column-wise later,
# so the four lengths should match.
length(price3)
length(detail3)
length(address3)
length(type3)
## [1] 40
## [1] 40
## [1] 40
## [1] 40

Fourth page

# Download the fourth page of the Zillow search results.
zillow4 <- read_html("https://www.zillow.com/duluth-ga/houses/1-_beds/4_p/?searchQueryState={%22pagination%22:{%22currentPage%22:4},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22isManufactured%22:{%22value%22:false},%22isCondo%22:{%22value%22:false},%22isMultiFamily%22:{%22value%22:false},%22isApartment%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}")

# Pull each field out of the page with the get_* helpers
# (defined outside this excerpt).
price4 <- zillow4 %>% get_price()
detail4 <- zillow4 %>% get_detail()
address4 <- zillow4 %>% get_address()
type4 <- zillow4 %>% get_type()

# Print the length of every vector; they are combined column-wise later,
# so the four lengths should match.
length(price4)
length(detail4)
length(address4)
length(type4)
## [1] 40
## [1] 40
## [1] 40
## [1] 40

Fifth page

# Download the fifth page of the Zillow search results.
zillow5 <- read_html("https://www.zillow.com/duluth-ga/houses/1-_beds/5_p/?searchQueryState={%22pagination%22:{%22currentPage%22:5},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22isManufactured%22:{%22value%22:false},%22isCondo%22:{%22value%22:false},%22isMultiFamily%22:{%22value%22:false},%22isApartment%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}")

# Pull each field out of the page with the get_* helpers
# (defined outside this excerpt).
price5 <- zillow5 %>% get_price()
detail5 <- zillow5 %>% get_detail()
address5 <- zillow5 %>% get_address()
type5 <- zillow5 %>% get_type()

# Print the length of every vector; they are combined column-wise later,
# so the four lengths should match.
length(price5)
length(detail5)
length(address5)
length(type5)
## [1] 40
## [1] 40
## [1] 40
## [1] 40

Sixth page

# Download the sixth page of the Zillow search results.
zillow6 <- read_html("https://www.zillow.com/duluth-ga/houses/1-_beds/6_p/?searchQueryState={%22pagination%22:{%22currentPage%22:6},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22isManufactured%22:{%22value%22:false},%22isCondo%22:{%22value%22:false},%22isMultiFamily%22:{%22value%22:false},%22isApartment%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}")

# Pull each field out of the page with the get_* helpers
# (defined outside this excerpt).
price6 <- zillow6 %>% get_price()
detail6 <- zillow6 %>% get_detail()
address6 <- zillow6 %>% get_address()
type6 <- zillow6 %>% get_type()

# Print the length of every vector; they are combined column-wise later,
# so the four lengths should match.
length(price6)
length(detail6)
length(address6)
length(type6)
## [1] 34
## [1] 34
## [1] 34
## [1] 34

Do some cleaning

Before merging all six pages, I parse the detail string of each page into separate bedroom, bathroom, and square-footage columns.

# Combine the page-1 vectors into a tibble and parse the free-text fields.
# `tibble()` replaces the deprecated `data_frame()` constructor.
zillow1 <- tibble(price1, detail1, address1, type1) %>%
  mutate(
    # Number immediately before " bd" / " bds"; NA when absent.
    bedrooms = as.integer(str_extract(detail1, "\\d+(?= bds?)")),
    # Number immediately before " ba"; NA when absent.
    bathrooms = as.integer(str_extract(detail1, "\\d+(?= ba)")),
    # Digits, spaces and commas directly in front of "sqft".
    sqft = str_trim(str_extract(detail1, "[\\d ,]*(?=sqft)")),
    # Drop every thousands separator (not only the first) before converting.
    sqft = as.numeric(str_replace_all(sqft, ",", "")),
    # Keep digits only; a price string without digits becomes NA.
    price = as.numeric(str_replace_all(price1, "[^0-9]+", ""))
  ) %>%
  # Reorder the columns and drop the page suffix from address/type.
  select(price, sqft, bedrooms, bathrooms, address = address1, type = type1)

head(zillow1)
## # A tibble: 6 x 6
##    price  sqft bedrooms bathrooms address                      type             
##    <dbl> <dbl>    <int>     <int> <chr>                        <chr>            
## 1 279271  4900        7         5 2153 Pond Rd, Duluth, GA 30~ Pre-foreclosure ~
## 2 260000  1796        3         3 2902 Gravitt Trl, Duluth, G~ House for sale   
## 3 220000  1698        4         3 4405 Hopkins Lake Dr, Dulut~ Coming soon      
## 4 310000  2160        4         3 2356 Longlake Way, Duluth, ~ House for sale   
## 5 210000  1123        2         2 2875 Barnwood Xing, Duluth,~ House for sale   
## 6     NA  2440        3         2 2325 Oak Glenn Cir, Duluth,~ Auction
# Combine the page-2 vectors into a tibble and parse the free-text fields.
# `tibble()` replaces the deprecated `data_frame()` constructor.
zillow2 <- tibble(price2, detail2, address2, type2) %>%
  mutate(
    # Number immediately before " bd" / " bds"; NA when absent.
    bedrooms = as.integer(str_extract(detail2, "\\d+(?= bds?)")),
    # Number immediately before " ba"; NA when absent.
    bathrooms = as.integer(str_extract(detail2, "\\d+(?= ba)")),
    # Digits, spaces and commas directly in front of "sqft".
    sqft = str_trim(str_extract(detail2, "[\\d ,]*(?=sqft)")),
    # Drop every thousands separator (not only the first) before converting.
    sqft = as.numeric(str_replace_all(sqft, ",", "")),
    # Keep digits only; a price string without digits becomes NA.
    price = as.numeric(str_replace_all(price2, "[^0-9]+", ""))
  ) %>%
  # Reorder the columns and drop the page suffix from address/type.
  select(price, sqft, bedrooms, bathrooms, address = address2, type = type2)

head(zillow2)
## # A tibble: 6 x 6
##    price  sqft bedrooms bathrooms address                          type         
##    <dbl> <dbl>    <int>     <int> <chr>                            <chr>        
## 1 619000  4536        4         6 3806 Turnberry Ct, Duluth, GA 3~ House for sa~
## 2 425000  3867        5         5 1905 Noblin Ridge Trl, Duluth, ~ House for sa~
## 3 788425  5115        5         5 8930 Moor Park Run, Duluth, GA ~ Pre-foreclos~
## 4 494900  3106        4         4 Everglade W/Basement Plan, Sout~ New construc~
## 5 475000  4445        6         5 2719 Cedar Kay Trl, Duluth, GA ~ House for sa~
## 6 409000  3525        4         4 1668 Westvale Pl, Duluth, GA 30~ House for sa~
# Combine the page-3 vectors into a tibble and parse the free-text fields.
# `tibble()` replaces the deprecated `data_frame()` constructor.
zillow3 <- tibble(price3, detail3, address3, type3) %>%
  mutate(
    # Number immediately before " bd" / " bds"; NA when absent.
    bedrooms = as.integer(str_extract(detail3, "\\d+(?= bds?)")),
    # Number immediately before " ba"; NA when absent.
    bathrooms = as.integer(str_extract(detail3, "\\d+(?= ba)")),
    # Digits, spaces and commas directly in front of "sqft".
    sqft = str_trim(str_extract(detail3, "[\\d ,]*(?=sqft)")),
    # Drop every thousands separator (not only the first) before converting.
    sqft = as.numeric(str_replace_all(sqft, ",", "")),
    # Keep digits only; a price string without digits becomes NA.
    price = as.numeric(str_replace_all(price3, "[^0-9]+", ""))
  ) %>%
  # Reorder the columns and drop the page suffix from address/type.
  select(price, sqft, bedrooms, bathrooms, address = address3, type = type3)

head(zillow3)
## # A tibble: 6 x 6
##     price  sqft bedrooms bathrooms address                          type        
##     <dbl> <dbl>    <int>     <int> <chr>                            <chr>       
## 1  670000  6010        6         7 7950 Chancery Rdg, Duluth, GA 3~ House for s~
## 2  350000  3028        4         3 4880 Racquet Ct, Duluth, GA 300~ House for s~
## 3 1125000  7689        6         8 2837 Major Ridge Trl, Duluth, G~ House for s~
## 4 1469000  9248        6         8 2942 Darlington Run, Duluth, GA~ House for s~
## 5  449000  3324        5         5 2540 Northmont Pkwy, Duluth, GA~ House for s~
## 6  925000  6056        5         5 8980 Moor Park Run, Duluth, GA ~ House for s~
# Combine the page-4 vectors into a tibble and parse the free-text fields.
# `tibble()` replaces the deprecated `data_frame()` constructor.
zillow4 <- tibble(price4, detail4, address4, type4) %>%
  mutate(
    # Number immediately before " bd" / " bds"; NA when absent.
    bedrooms = as.integer(str_extract(detail4, "\\d+(?= bds?)")),
    # Number immediately before " ba"; NA when absent.
    bathrooms = as.integer(str_extract(detail4, "\\d+(?= ba)")),
    # Digits, spaces and commas directly in front of "sqft".
    sqft = str_trim(str_extract(detail4, "[\\d ,]*(?=sqft)")),
    # Drop every thousands separator (not only the first) before converting.
    sqft = as.numeric(str_replace_all(sqft, ",", "")),
    # Keep digits only; a price string without digits becomes NA.
    price = as.numeric(str_replace_all(price4, "[^0-9]+", ""))
  ) %>%
  # Reorder the columns and drop the page suffix from address/type.
  select(price, sqft, bedrooms, bathrooms, address = address4, type = type4)

head(zillow4)
## # A tibble: 6 x 6
##     price  sqft bedrooms bathrooms address                     type             
##     <dbl> <dbl>    <int>     <int> <chr>                       <chr>            
## 1 1190000  5599        5         7 3299 Carmichael Pl, Duluth~ House for sale   
## 2  569900  3619        4         5 601 Astley Dr, Johns Creek~ New construction 
## 3 1099000  7085        5         7 1855 Sugarloaf Club Dr, Du~ House for sale   
## 4  450290  3337        4         4 4492 Claiborne Ct, Duluth,~ New construction 
## 5  466020  3337        4         4 4491 Claiborne Ct, Duluth,~ New construction 
## 6  340957  2658        4         3 3165 Oak Hampton Way, Dulu~ Pre-foreclosure ~
# Combine the page-5 vectors into a tibble and parse the free-text fields.
# `tibble()` replaces the deprecated `data_frame()` constructor.
zillow5 <- tibble(price5, detail5, address5, type5) %>%
  mutate(
    # Number immediately before " bd" / " bds"; NA when absent.
    bedrooms = as.integer(str_extract(detail5, "\\d+(?= bds?)")),
    # Number immediately before " ba"; NA when absent.
    bathrooms = as.integer(str_extract(detail5, "\\d+(?= ba)")),
    # Digits, spaces and commas directly in front of "sqft".
    sqft = str_trim(str_extract(detail5, "[\\d ,]*(?=sqft)")),
    # Drop every thousands separator (not only the first) before converting.
    sqft = as.numeric(str_replace_all(sqft, ",", "")),
    # Keep digits only; a price string without digits becomes NA.
    price = as.numeric(str_replace_all(price5, "[^0-9]+", ""))
  ) %>%
  # Reorder the columns and drop the page suffix from address/type.
  select(price, sqft, bedrooms, bathrooms, address = address5, type = type5)

head(zillow5)
## # A tibble: 6 x 6
##     price  sqft bedrooms bathrooms address                     type             
##     <dbl> <dbl>    <int>     <int> <chr>                       <chr>            
## 1  304900  1636        2         2 2259 Alnwick Dr, Duluth, G~ House for sale   
## 2  254798  2827        4         3 4424 Old Norcross Rd, Dulu~ Pre-foreclosure ~
## 3 3990000 13756        6         8 3977 Sweet Bottom Dr, Dulu~ House for sale   
## 4  439900  3330        4         4 4542 Claiborne Ct, Duluth,~ New construction 
## 5  199961  1696        3         2 2075 Executive Dr, Duluth,~ Pre-foreclosure ~
## 6  253559  2200        4         3 2880 Gravitt Rd, Duluth, G~ Pre-foreclosure ~
# Combine the page-6 vectors into a tibble and parse the free-text fields.
# `tibble()` replaces the deprecated `data_frame()` constructor.
zillow6 <- tibble(price6, detail6, address6, type6) %>%
  mutate(
    # Number immediately before " bd" / " bds"; NA when absent.
    bedrooms = as.integer(str_extract(detail6, "\\d+(?= bds?)")),
    # Number immediately before " ba"; NA when absent.
    bathrooms = as.integer(str_extract(detail6, "\\d+(?= ba)")),
    # Digits, spaces and commas directly in front of "sqft".
    sqft = str_trim(str_extract(detail6, "[\\d ,]*(?=sqft)")),
    # Drop every thousands separator (not only the first) before converting.
    sqft = as.numeric(str_replace_all(sqft, ",", "")),
    # Keep digits only; a price string without digits becomes NA.
    price = as.numeric(str_replace_all(price6, "[^0-9]+", ""))
  ) %>%
  # Reorder the columns and drop the page suffix from address/type.
  select(price, sqft, bedrooms, bathrooms, address = address6, type = type6)

head(zillow6)
## # A tibble: 6 x 6
##     price  sqft bedrooms bathrooms address                     type             
##     <dbl> <dbl>    <int>     <int> <chr>                       <chr>            
## 1  222500  1320        3         2 2606 Meadow Ridge Dr, Dulu~ House for sale   
## 2 1274800  7431        7         8 8765 Colonial Pl, Duluth, ~ House for sale   
## 3 1200000  8397        6         8 8790 Colonial Pl, Duluth, ~ House for sale   
## 4  775000  5979        6         7 3824 Saint Annes Ct, Dulut~ House for sale   
## 5  260519  2054        3         3 2805 Shelter Cv, Duluth, G~ Pre-foreclosure ~
## 6  268374  1970        3         3 3120 Bugle Dr, Duluth, GA ~ Pre-foreclosure ~

This is the final data set.

# Stack the six cleaned page tibbles into one data set, page 1 first.
zillow_total <- bind_rows(
  zillow1, zillow2, zillow3,
  zillow4, zillow5, zillow6
)
head(zillow_total)
## # A tibble: 6 x 6
##    price  sqft bedrooms bathrooms address                      type             
##    <dbl> <dbl>    <int>     <int> <chr>                        <chr>            
## 1 279271  4900        7         5 2153 Pond Rd, Duluth, GA 30~ Pre-foreclosure ~
## 2 260000  1796        3         3 2902 Gravitt Trl, Duluth, G~ House for sale   
## 3 220000  1698        4         3 4405 Hopkins Lake Dr, Dulut~ Coming soon      
## 4 310000  2160        4         3 2356 Longlake Way, Duluth, ~ House for sale   
## 5 210000  1123        2         2 2875 Barnwood Xing, Duluth,~ House for sale   
## 6     NA  2440        3         2 2325 Oak Glenn Cir, Duluth,~ Auction

Visual analysis

# Price against square footage, with points colored by bedroom count.
# The title typo ("betwwen") is fixed and the title is set through labs().
ggplot(zillow_total, aes(x = sqft, y = price)) +
  geom_point(aes(color = as.factor(bedrooms))) +
  labs(
    title = "Scatter plot between price and square footage",
    x = "Square Footage",
    y = "Price"
  )

The scatter plot shows that there are some outliers.

# Distribution of price within each listing type; outliers are drawn in red.
zillow_total %>%
  ggplot(aes(x = type, y = price)) +
  geom_boxplot(outlier.color = "red") +
  labs(y = "Price", x = "Type")

I create a dummy variable for foreclosure to examine the negative effect of foreclosure on house prices.

# Foreclosure dummy: 1 for "Pre-foreclosure / Auction" listings, 0 for the
# three ordinary sale types. Every other listing type (and a missing type)
# stays NA, which is what the previous code produced; the mapping is now
# explicit instead of relying on as.numeric() failing on leftover strings,
# which also raised an "NAs introduced by coercion" warning.
# NOTE(review): rows with an NA dummy are dropped by lm(); confirm whether
# types such as "Coming soon" should be coded 0 instead.
zillow_total$foreclosure <- case_when(
  zillow_total$type %in% c(
    "House for sale",
    "For sale by owner",
    "New construction"
  ) ~ 0,
  zillow_total$type == "Pre-foreclosure / Auction" ~ 1,
  TRUE ~ NA_real_
)

summary(zillow_total)
##      price              sqft          bedrooms       bathrooms     
##  Min.   : 109105   Min.   :  980   Min.   :2.000   Min.   : 1.000  
##  1st Qu.: 285617   1st Qu.: 2157   1st Qu.:3.000   1st Qu.: 3.000  
##  Median : 417461   Median : 2985   Median :4.000   Median : 4.000  
##  Mean   : 627671   Mean   : 3946   Mean   :4.342   Mean   : 4.393  
##  3rd Qu.: 775000   3rd Qu.: 5008   3rd Qu.:5.000   3rd Qu.: 5.000  
##  Max.   :4975000   Max.   :13756   Max.   :7.000   Max.   :13.000  
##  NA's   :4         NA's   :6                       NA's   :5       
##    address              type            foreclosure    
##  Length:234         Length:234         Min.   :0.0000  
##  Class :character   Class :character   1st Qu.:0.0000  
##  Mode  :character   Mode  :character   Median :0.0000  
##                                        Mean   :0.2087  
##                                        3rd Qu.:0.0000  
##                                        Max.   :1.0000  
##                                        NA's   :4

Regression Analysis

# Hedonic regression on the full sample: price explained by size, room
# counts and the foreclosure dummy. Rows with any NA in these variables are
# omitted by lm().
model <- lm(
  price ~ sqft + bedrooms + bathrooms + foreclosure,
  data = zillow_total
)
coef(model)
##  (Intercept)         sqft     bedrooms    bathrooms  foreclosure 
##  127112.0884     198.1076 -156054.7592   87918.3795  -41347.2539
summary(model)
## 
## Call:
## lm(formula = price ~ sqft + bedrooms + bathrooms + foreclosure, 
##     data = zillow_total)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -718028 -126741   -5845   93073 2221934 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  127112.09   97532.16   1.303  0.19388    
## sqft            198.11      18.79  10.540  < 2e-16 ***
## bedrooms    -156054.76   29207.57  -5.343 2.34e-07 ***
## bathrooms     87918.38   24910.42   3.529  0.00051 ***
## foreclosure  -41347.25   52979.57  -0.780  0.43600    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 291300 on 213 degrees of freedom
##   (16 observations deleted due to missingness)
## Multiple R-squared:  0.789,  Adjusted R-squared:  0.785 
## F-statistic: 199.1 on 4 and 213 DF,  p-value: < 2.2e-16

“sqft”, “bedrooms”, and “bathrooms” are statistically significant at the 1% level. However, “bedrooms” has a negative sign, which I did not expect. “foreclosure” has the expected negative sign, but it is not statistically significant. So, I remove outliers with prices above the 75th percentile.

# Keep only listings priced below the cutoff (rows with NA price are dropped
# by filter()).
# NOTE(review): the cutoff is hard-coded; summary(zillow_total) reports a
# 3rd quartile of 775000, so confirm which value is intended.
zillow75 <- zillow_total %>%
  filter(price < 758751)

# Same specification as the full-sample model, fitted on the trimmed sample.
model_1 <- lm(
  price ~ sqft + bedrooms + bathrooms + foreclosure,
  data = zillow75
)
coef(model_1)
##  (Intercept)         sqft     bedrooms    bathrooms  foreclosure 
## 100440.81409     77.72874  -4006.80726  21560.26174 -54169.13177
summary(model_1)
## 
## Call:
## lm(formula = price ~ sqft + bedrooms + bathrooms + foreclosure, 
##     data = zillow75)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -227625  -32652   -3611   28336  214538 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 100440.814  23135.273   4.341 2.52e-05 ***
## sqft            77.729      8.244   9.428  < 2e-16 ***
## bedrooms     -4006.807   8081.041  -0.496   0.6207    
## bathrooms    21560.262   6826.866   3.158   0.0019 ** 
## foreclosure -54169.132  11440.318  -4.735 4.84e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 59280 on 158 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.7586, Adjusted R-squared:  0.7524 
## F-statistic: 124.1 on 4 and 158 DF,  p-value: < 2.2e-16

The R-squared is 75.9%. To increase it, I might add more characteristics that affect house prices, such as neighborhood characteristics (amenities and school quality), time on the market, and agent characteristics (age and experience).

To collect agent characteristics, I might use a multiple listing service (MLS) data set. An MLS is a private database developed by real estate brokers to provide information about properties for sale. It includes several characteristics of agents, such as age, sex, experience, and company.