Zillow is a leading web-based real estate information service in the United States. We collect data from Zillow to analyze what determines house prices, using a hedonic pricing model.
I chose listings with 1+ bedrooms, any number of bathrooms, and single-family houses only, in Duluth, GA. The URL of my search is shown below:
library(rvest)
library(stringr)
library(dplyr)
library(ggplot2)
library(tidyverse)
The results are spread over 6 pages, so I will scrape each of the 6 pages manually and combine all 6 datasets with the rbind command.
Four functions are used to collect data.
# Extract the listing prices from a parsed results page.
#   html: a parsed HTML document (as returned by read_html()).
# Returns a character vector with the whitespace-trimmed text of every
# node matching the `.list-card-price` CSS selector.
get_price <- function(html) {
  price_nodes <- html_nodes(html, ".list-card-price")
  str_trim(html_text(price_nodes))
}
# Extract the listing detail strings from a parsed results page.
#   html: a parsed HTML document (as returned by read_html()).
# Returns a character vector with the whitespace-trimmed text of every
# node matching the `.list-card-details` CSS selector.
get_detail <- function(html) {
  detail_nodes <- html_nodes(html, ".list-card-details")
  str_trim(html_text(detail_nodes))
}
# Extract the listing addresses from a parsed results page.
#   html: a parsed HTML document (as returned by read_html()).
# Returns a character vector with the whitespace-trimmed text of every
# node matching the `.list-card-addr` CSS selector.
get_address <- function(html) {
  address_nodes <- html_nodes(html, ".list-card-addr")
  str_trim(html_text(address_nodes))
}
# Extract the listing type labels from a parsed results page.
#   html: a parsed HTML document (as returned by read_html()).
# Returns a character vector with the whitespace-trimmed text of every
# node matching the `.list-card-type` CSS selector.
get_type <- function(html) {
  type_nodes <- html_nodes(html, ".list-card-type")
  str_trim(html_text(type_nodes))
}
# Page 1 of the search results: download and parse the page, then pull the
# four fields out of it with the helper functions.
zillow1 <- "https://www.zillow.com/duluth-ga/houses/1-_beds/?searchQueryState={%22pagination%22:{},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22isManufactured%22:{%22value%22:false},%22isCondo%22:{%22value%22:false},%22isMultiFamily%22:{%22value%22:false},%22isApartment%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}" %>%
  read_html()
price1 <- zillow1 %>% get_price()
detail1 <- zillow1 %>% get_detail()
address1 <- zillow1 %>% get_address()
type1 <- zillow1 %>% get_type()
# Print the four lengths so they can be compared before the vectors are
# combined column-wise.
length(price1)
length(detail1)
length(address1)
length(type1)
## [1] 40
## [1] 40
## [1] 40
## [1] 40
# Page 2 of the search results: download and parse the page, then pull the
# four fields out of it with the helper functions.
zillow2 <- "https://www.zillow.com/duluth-ga/houses/1-_beds/2_p/?searchQueryState={%22pagination%22:{%22currentPage%22:2},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22isManufactured%22:{%22value%22:false},%22isCondo%22:{%22value%22:false},%22isMultiFamily%22:{%22value%22:false},%22isApartment%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}" %>%
  read_html()
price2 <- zillow2 %>% get_price()
detail2 <- zillow2 %>% get_detail()
address2 <- zillow2 %>% get_address()
type2 <- zillow2 %>% get_type()
# Print the four lengths so they can be compared before the vectors are
# combined column-wise.
length(price2)
length(detail2)
length(address2)
length(type2)
## [1] 40
## [1] 40
## [1] 40
## [1] 40
# Page 3 of the search results: download and parse the page, then pull the
# four fields out of it with the helper functions.
zillow3 <- "https://www.zillow.com/duluth-ga/houses/1-_beds/3_p/?searchQueryState={%22pagination%22:{%22currentPage%22:3},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22isManufactured%22:{%22value%22:false},%22isCondo%22:{%22value%22:false},%22isMultiFamily%22:{%22value%22:false},%22isApartment%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}" %>%
  read_html()
price3 <- zillow3 %>% get_price()
detail3 <- zillow3 %>% get_detail()
address3 <- zillow3 %>% get_address()
type3 <- zillow3 %>% get_type()
# Print the four lengths so they can be compared before the vectors are
# combined column-wise.
length(price3)
length(detail3)
length(address3)
length(type3)
## [1] 40
## [1] 40
## [1] 40
## [1] 40
# Page 4 of the search results: download and parse the page, then pull the
# four fields out of it with the helper functions.
zillow4 <- "https://www.zillow.com/duluth-ga/houses/1-_beds/4_p/?searchQueryState={%22pagination%22:{%22currentPage%22:4},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22isManufactured%22:{%22value%22:false},%22isCondo%22:{%22value%22:false},%22isMultiFamily%22:{%22value%22:false},%22isApartment%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}" %>%
  read_html()
price4 <- zillow4 %>% get_price()
detail4 <- zillow4 %>% get_detail()
address4 <- zillow4 %>% get_address()
type4 <- zillow4 %>% get_type()
# Print the four lengths so they can be compared before the vectors are
# combined column-wise.
length(price4)
length(detail4)
length(address4)
length(type4)
## [1] 40
## [1] 40
## [1] 40
## [1] 40
# Page 5 of the search results: download and parse the page, then pull the
# four fields out of it with the helper functions.
zillow5 <- "https://www.zillow.com/duluth-ga/houses/1-_beds/5_p/?searchQueryState={%22pagination%22:{%22currentPage%22:5},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22isManufactured%22:{%22value%22:false},%22isCondo%22:{%22value%22:false},%22isMultiFamily%22:{%22value%22:false},%22isApartment%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}" %>%
  read_html()
price5 <- zillow5 %>% get_price()
detail5 <- zillow5 %>% get_detail()
address5 <- zillow5 %>% get_address()
type5 <- zillow5 %>% get_type()
# Print the four lengths so they can be compared before the vectors are
# combined column-wise.
length(price5)
length(detail5)
length(address5)
length(type5)
## [1] 40
## [1] 40
## [1] 40
## [1] 40
# Page 6 of the search results: download and parse the page, then pull the
# four fields out of it with the helper functions.
zillow6 <- "https://www.zillow.com/duluth-ga/houses/1-_beds/6_p/?searchQueryState={%22pagination%22:{%22currentPage%22:6},%22usersSearchTerm%22:%22duluth,%20ga%22,%22mapBounds%22:{%22west%22:-84.33665031103516,%22east%22:-83.93496268896484,%22south%22:33.891083606886006,%22north%22:34.122182758016194},%22regionSelection%22:[{%22regionId%22:51757,%22regionType%22:6}],%22isMapVisible%22:true,%22mapZoom%22:12,%22filterState%22:{%22beds%22:{%22min%22:1},%22isManufactured%22:{%22value%22:false},%22isCondo%22:{%22value%22:false},%22isMultiFamily%22:{%22value%22:false},%22isApartment%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}" %>%
  read_html()
price6 <- zillow6 %>% get_price()
detail6 <- zillow6 %>% get_detail()
address6 <- zillow6 %>% get_address()
type6 <- zillow6 %>% get_type()
# Print the four lengths so they can be compared before the vectors are
# combined column-wise.
length(price6)
length(detail6)
length(address6)
length(type6)
## [1] 34
## [1] 34
## [1] 34
## [1] 34
Before merging all 6 pages, I split the strings in the detail column into separate fields (bedrooms, bathrooms, and square footage) for each page.
# Build the cleaned page-1 table from the scraped character vectors.
# `tibble()` replaces the deprecated `data_frame()`; columns are auto-named
# after the input vectors (price1, detail1, address1, type1).
zillow1 <- tibble(price1, detail1, address1, type1) %>%
  mutate(
    # Digits immediately before " bd"/" bds" and " ba" in the detail string.
    # NOTE(review): a fractional count such as "2.5 ba" would be read as 5.
    bedrooms = as.integer(str_extract(detail1, "\\d+(?= bds?)")),
    bathrooms = as.integer(str_extract(detail1, "\\d+(?= ba)")),
    # Digits/commas/spaces directly before "sqft"; drop every thousands
    # separator (not just the first) before converting to a number.
    sqft = str_trim(str_extract(detail1, "[\\d ,]*(?=sqft)")),
    sqft = as.numeric(str_replace_all(sqft, ",", "")),
    # Keep digits only; a price with no digits becomes NA.
    price = as.numeric(str_replace_all(price1, "[^0-9]", ""))
  ) %>%
  # Keep the analysis columns, renaming the raw address/type columns.
  select(price, sqft, bedrooms, bathrooms, address = address1, type = type1)
head(zillow1)
## # A tibble: 6 x 6
## price sqft bedrooms bathrooms address type
## <dbl> <dbl> <int> <int> <chr> <chr>
## 1 279271 4900 7 5 2153 Pond Rd, Duluth, GA 30~ Pre-foreclosure ~
## 2 260000 1796 3 3 2902 Gravitt Trl, Duluth, G~ House for sale
## 3 220000 1698 4 3 4405 Hopkins Lake Dr, Dulut~ Coming soon
## 4 310000 2160 4 3 2356 Longlake Way, Duluth, ~ House for sale
## 5 210000 1123 2 2 2875 Barnwood Xing, Duluth,~ House for sale
## 6 NA 2440 3 2 2325 Oak Glenn Cir, Duluth,~ Auction
# Build the cleaned page-2 table from the scraped character vectors.
# `tibble()` replaces the deprecated `data_frame()`; columns are auto-named
# after the input vectors (price2, detail2, address2, type2).
zillow2 <- tibble(price2, detail2, address2, type2) %>%
  mutate(
    # Digits immediately before " bd"/" bds" and " ba" in the detail string.
    # NOTE(review): a fractional count such as "2.5 ba" would be read as 5.
    bedrooms = as.integer(str_extract(detail2, "\\d+(?= bds?)")),
    bathrooms = as.integer(str_extract(detail2, "\\d+(?= ba)")),
    # Digits/commas/spaces directly before "sqft"; drop every thousands
    # separator (not just the first) before converting to a number.
    sqft = str_trim(str_extract(detail2, "[\\d ,]*(?=sqft)")),
    sqft = as.numeric(str_replace_all(sqft, ",", "")),
    # Keep digits only; a price with no digits becomes NA.
    price = as.numeric(str_replace_all(price2, "[^0-9]", ""))
  ) %>%
  # Keep the analysis columns, renaming the raw address/type columns.
  select(price, sqft, bedrooms, bathrooms, address = address2, type = type2)
head(zillow2)
## # A tibble: 6 x 6
## price sqft bedrooms bathrooms address type
## <dbl> <dbl> <int> <int> <chr> <chr>
## 1 619000 4536 4 6 3806 Turnberry Ct, Duluth, GA 3~ House for sa~
## 2 425000 3867 5 5 1905 Noblin Ridge Trl, Duluth, ~ House for sa~
## 3 788425 5115 5 5 8930 Moor Park Run, Duluth, GA ~ Pre-foreclos~
## 4 494900 3106 4 4 Everglade W/Basement Plan, Sout~ New construc~
## 5 475000 4445 6 5 2719 Cedar Kay Trl, Duluth, GA ~ House for sa~
## 6 409000 3525 4 4 1668 Westvale Pl, Duluth, GA 30~ House for sa~
# Build the cleaned page-3 table from the scraped character vectors.
# `tibble()` replaces the deprecated `data_frame()`; columns are auto-named
# after the input vectors (price3, detail3, address3, type3).
zillow3 <- tibble(price3, detail3, address3, type3) %>%
  mutate(
    # Digits immediately before " bd"/" bds" and " ba" in the detail string.
    # NOTE(review): a fractional count such as "2.5 ba" would be read as 5.
    bedrooms = as.integer(str_extract(detail3, "\\d+(?= bds?)")),
    bathrooms = as.integer(str_extract(detail3, "\\d+(?= ba)")),
    # Digits/commas/spaces directly before "sqft"; drop every thousands
    # separator (not just the first) before converting to a number.
    sqft = str_trim(str_extract(detail3, "[\\d ,]*(?=sqft)")),
    sqft = as.numeric(str_replace_all(sqft, ",", "")),
    # Keep digits only; a price with no digits becomes NA.
    price = as.numeric(str_replace_all(price3, "[^0-9]", ""))
  ) %>%
  # Keep the analysis columns, renaming the raw address/type columns.
  select(price, sqft, bedrooms, bathrooms, address = address3, type = type3)
head(zillow3)
## # A tibble: 6 x 6
## price sqft bedrooms bathrooms address type
## <dbl> <dbl> <int> <int> <chr> <chr>
## 1 670000 6010 6 7 7950 Chancery Rdg, Duluth, GA 3~ House for s~
## 2 350000 3028 4 3 4880 Racquet Ct, Duluth, GA 300~ House for s~
## 3 1125000 7689 6 8 2837 Major Ridge Trl, Duluth, G~ House for s~
## 4 1469000 9248 6 8 2942 Darlington Run, Duluth, GA~ House for s~
## 5 449000 3324 5 5 2540 Northmont Pkwy, Duluth, GA~ House for s~
## 6 925000 6056 5 5 8980 Moor Park Run, Duluth, GA ~ House for s~
# Build the cleaned page-4 table from the scraped character vectors.
# `tibble()` replaces the deprecated `data_frame()`; columns are auto-named
# after the input vectors (price4, detail4, address4, type4).
zillow4 <- tibble(price4, detail4, address4, type4) %>%
  mutate(
    # Digits immediately before " bd"/" bds" and " ba" in the detail string.
    # NOTE(review): a fractional count such as "2.5 ba" would be read as 5.
    bedrooms = as.integer(str_extract(detail4, "\\d+(?= bds?)")),
    bathrooms = as.integer(str_extract(detail4, "\\d+(?= ba)")),
    # Digits/commas/spaces directly before "sqft"; drop every thousands
    # separator (not just the first) before converting to a number.
    sqft = str_trim(str_extract(detail4, "[\\d ,]*(?=sqft)")),
    sqft = as.numeric(str_replace_all(sqft, ",", "")),
    # Keep digits only; a price with no digits becomes NA.
    price = as.numeric(str_replace_all(price4, "[^0-9]", ""))
  ) %>%
  # Keep the analysis columns, renaming the raw address/type columns.
  select(price, sqft, bedrooms, bathrooms, address = address4, type = type4)
head(zillow4)
## # A tibble: 6 x 6
## price sqft bedrooms bathrooms address type
## <dbl> <dbl> <int> <int> <chr> <chr>
## 1 1190000 5599 5 7 3299 Carmichael Pl, Duluth~ House for sale
## 2 569900 3619 4 5 601 Astley Dr, Johns Creek~ New construction
## 3 1099000 7085 5 7 1855 Sugarloaf Club Dr, Du~ House for sale
## 4 450290 3337 4 4 4492 Claiborne Ct, Duluth,~ New construction
## 5 466020 3337 4 4 4491 Claiborne Ct, Duluth,~ New construction
## 6 340957 2658 4 3 3165 Oak Hampton Way, Dulu~ Pre-foreclosure ~
# Build the cleaned page-5 table from the scraped character vectors.
# `tibble()` replaces the deprecated `data_frame()`; columns are auto-named
# after the input vectors (price5, detail5, address5, type5).
zillow5 <- tibble(price5, detail5, address5, type5) %>%
  mutate(
    # Digits immediately before " bd"/" bds" and " ba" in the detail string.
    # NOTE(review): a fractional count such as "2.5 ba" would be read as 5.
    bedrooms = as.integer(str_extract(detail5, "\\d+(?= bds?)")),
    bathrooms = as.integer(str_extract(detail5, "\\d+(?= ba)")),
    # Digits/commas/spaces directly before "sqft"; drop every thousands
    # separator (not just the first) before converting to a number.
    sqft = str_trim(str_extract(detail5, "[\\d ,]*(?=sqft)")),
    sqft = as.numeric(str_replace_all(sqft, ",", "")),
    # Keep digits only; a price with no digits becomes NA.
    price = as.numeric(str_replace_all(price5, "[^0-9]", ""))
  ) %>%
  # Keep the analysis columns, renaming the raw address/type columns.
  select(price, sqft, bedrooms, bathrooms, address = address5, type = type5)
head(zillow5)
## # A tibble: 6 x 6
## price sqft bedrooms bathrooms address type
## <dbl> <dbl> <int> <int> <chr> <chr>
## 1 304900 1636 2 2 2259 Alnwick Dr, Duluth, G~ House for sale
## 2 254798 2827 4 3 4424 Old Norcross Rd, Dulu~ Pre-foreclosure ~
## 3 3990000 13756 6 8 3977 Sweet Bottom Dr, Dulu~ House for sale
## 4 439900 3330 4 4 4542 Claiborne Ct, Duluth,~ New construction
## 5 199961 1696 3 2 2075 Executive Dr, Duluth,~ Pre-foreclosure ~
## 6 253559 2200 4 3 2880 Gravitt Rd, Duluth, G~ Pre-foreclosure ~
# Build the cleaned page-6 table from the scraped character vectors.
# `tibble()` replaces the deprecated `data_frame()`; columns are auto-named
# after the input vectors (price6, detail6, address6, type6).
zillow6 <- tibble(price6, detail6, address6, type6) %>%
  mutate(
    # Digits immediately before " bd"/" bds" and " ba" in the detail string.
    # NOTE(review): a fractional count such as "2.5 ba" would be read as 5.
    bedrooms = as.integer(str_extract(detail6, "\\d+(?= bds?)")),
    bathrooms = as.integer(str_extract(detail6, "\\d+(?= ba)")),
    # Digits/commas/spaces directly before "sqft"; drop every thousands
    # separator (not just the first) before converting to a number.
    sqft = str_trim(str_extract(detail6, "[\\d ,]*(?=sqft)")),
    sqft = as.numeric(str_replace_all(sqft, ",", "")),
    # Keep digits only; a price with no digits becomes NA.
    price = as.numeric(str_replace_all(price6, "[^0-9]", ""))
  ) %>%
  # Keep the analysis columns, renaming the raw address/type columns.
  select(price, sqft, bedrooms, bathrooms, address = address6, type = type6)
head(zillow6)
## # A tibble: 6 x 6
## price sqft bedrooms bathrooms address type
## <dbl> <dbl> <int> <int> <chr> <chr>
## 1 222500 1320 3 2 2606 Meadow Ridge Dr, Dulu~ House for sale
## 2 1274800 7431 7 8 8765 Colonial Pl, Duluth, ~ House for sale
## 3 1200000 8397 6 8 8790 Colonial Pl, Duluth, ~ House for sale
## 4 775000 5979 6 7 3824 Saint Annes Ct, Dulut~ House for sale
## 5 260519 2054 3 3 2805 Shelter Cv, Duluth, G~ Pre-foreclosure ~
## 6 268374 1970 3 3 3120 Bugle Dr, Duluth, GA ~ Pre-foreclosure ~
This is the final data set.
# Stack the six per-page tables (which share the same six columns) row-wise
# into a single data set, then preview the first rows.
zillow_total <- do.call(
  rbind,
  list(zillow1, zillow2, zillow3, zillow4, zillow5, zillow6)
)
head(zillow_total)
## # A tibble: 6 x 6
## price sqft bedrooms bathrooms address type
## <dbl> <dbl> <int> <int> <chr> <chr>
## 1 279271 4900 7 5 2153 Pond Rd, Duluth, GA 30~ Pre-foreclosure ~
## 2 260000 1796 3 3 2902 Gravitt Trl, Duluth, G~ House for sale
## 3 220000 1698 4 3 4405 Hopkins Lake Dr, Dulut~ Coming soon
## 4 310000 2160 4 3 2356 Longlake Way, Duluth, ~ House for sale
## 5 210000 1123 2 2 2875 Barnwood Xing, Duluth,~ House for sale
## 6 NA 2440 3 2 2325 Oak Glenn Cir, Duluth,~ Auction
# Scatter plot of listing price against square footage, with points coloured
# by bedroom count (treated as a categorical variable).
ggplot(zillow_total, aes(x = sqft, y = price)) +
  geom_point(aes(color = as.factor(bedrooms))) +
  labs(x = "Square Footage", y = "Price") +
  # Title typo fixed: "betwwen" -> "between".
  ggtitle("Scatter plot between price and square footage")
The scatter plot shows that there are some outliers.
# Box plot of price for each listing type; outlying points are drawn in red.
ggplot(data = zillow_total, mapping = aes(x = type, y = price)) +
  labs(x = "Type", y = "Price") +
  geom_boxplot(outlier.color = "red")
I create a dummy variable for foreclosure to examine the negative effect of foreclosure on house prices.
# Foreclosure dummy derived from the listing type:
#   0 for ordinary sales ("House for sale", "For sale by owner",
#     "New construction"),
#   1 for "Pre-foreclosure / Auction",
#   NA for every other (or missing) type.
# The mapping is spelled out with case_when() so unlisted types become NA
# explicitly, instead of relying on as.numeric() failing to parse the
# leftover label text (which also raised a coercion warning).
zillow_total <- zillow_total %>%
  mutate(
    foreclosure = case_when(
      type %in% c(
        "House for sale",
        "For sale by owner",
        "New construction"
      ) ~ 0,
      type == "Pre-foreclosure / Auction" ~ 1,
      TRUE ~ NA_real_
    )
  )
summary(zillow_total)
## price sqft bedrooms bathrooms
## Min. : 109105 Min. : 980 Min. :2.000 Min. : 1.000
## 1st Qu.: 285617 1st Qu.: 2157 1st Qu.:3.000 1st Qu.: 3.000
## Median : 417461 Median : 2985 Median :4.000 Median : 4.000
## Mean : 627671 Mean : 3946 Mean :4.342 Mean : 4.393
## 3rd Qu.: 775000 3rd Qu.: 5008 3rd Qu.:5.000 3rd Qu.: 5.000
## Max. :4975000 Max. :13756 Max. :7.000 Max. :13.000
## NA's :4 NA's :6 NA's :5
## address type foreclosure
## Length:234 Length:234 Min. :0.0000
## Class :character Class :character 1st Qu.:0.0000
## Mode :character Mode :character Median :0.0000
## Mean :0.2087
## 3rd Qu.:0.0000
## Max. :1.0000
## NA's :4
# Hedonic regression: price explained by square footage, bedroom count,
# bathroom count, and the foreclosure dummy, fitted on the full data set.
model <- lm(
  formula = price ~ sqft + bedrooms + bathrooms + foreclosure,
  data = zillow_total
)
# Show the estimated coefficients only.
coef(model)
## (Intercept) sqft bedrooms bathrooms foreclosure
## 127112.0884 198.1076 -156054.7592 87918.3795 -41347.2539
# Print the full regression summary for `model` (coefficient table,
# residual standard error, R-squared, F-statistic).
summary(model)
##
## Call:
## lm(formula = price ~ sqft + bedrooms + bathrooms + foreclosure,
## data = zillow_total)
##
## Residuals:
## Min 1Q Median 3Q Max
## -718028 -126741 -5845 93073 2221934
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 127112.09 97532.16 1.303 0.19388
## sqft 198.11 18.79 10.540 < 2e-16 ***
## bedrooms -156054.76 29207.57 -5.343 2.34e-07 ***
## bathrooms 87918.38 24910.42 3.529 0.00051 ***
## foreclosure -41347.25 52979.57 -0.780 0.43600
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 291300 on 213 degrees of freedom
## (16 observations deleted due to missingness)
## Multiple R-squared: 0.789, Adjusted R-squared: 0.785
## F-statistic: 199.1 on 4 and 213 DF, p-value: < 2.2e-16
“sqft”, “bedrooms”, and “bathrooms” are statistically significant at the 1% level. However, “bedrooms” has a negative sign, which I did not expect. “foreclosure” has the negative sign I expected, but it is not statistically significant. So, I remove the outliers with prices above the 75th percentile.
# Restrict the sample to listings priced below the 75th percentile of price.
# The cutoff is computed from the data instead of being hard-coded, so it
# stays consistent with whatever listings were scraped. Rows with a missing
# price are dropped by filter().
price_p75 <- quantile(zillow_total$price, probs = 0.75, na.rm = TRUE,
                      names = FALSE)
zillow75 <- filter(zillow_total, price < price_p75)
# Re-fit the same hedonic specification on the trimmed sample.
model_1 <- lm(price ~ sqft + bedrooms + bathrooms + foreclosure, zillow75)
model_1$coefficients
## (Intercept) sqft bedrooms bathrooms foreclosure
## 100440.81409 77.72874 -4006.80726 21560.26174 -54169.13177
# Print the full regression summary for `model_1` (coefficient table,
# residual standard error, R-squared, F-statistic).
summary(model_1)
##
## Call:
## lm(formula = price ~ sqft + bedrooms + bathrooms + foreclosure,
## data = zillow75)
##
## Residuals:
## Min 1Q Median 3Q Max
## -227625 -32652 -3611 28336 214538
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 100440.814 23135.273 4.341 2.52e-05 ***
## sqft 77.729 8.244 9.428 < 2e-16 ***
## bedrooms -4006.807 8081.041 -0.496 0.6207
## bathrooms 21560.262 6826.866 3.158 0.0019 **
## foreclosure -54169.132 11440.318 -4.735 4.84e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 59280 on 158 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.7586, Adjusted R-squared: 0.7524
## F-statistic: 124.1 on 4 and 158 DF, p-value: < 2.2e-16
On average, an additional bedroom is associated with a $4,006.81 lower price, but this effect is not statistically significant.
On average, an additional bathroom is associated with a $21,560.26 higher price.
On average, foreclosure houses sell at a discount of $54,169.13.
R-squared is 75.9%. To increase R-squared, I might add more characteristics that affect house prices, such as neighborhood characteristics (amenities and school quality), time on the market, and agent characteristics (age and experience).
To collect agent characteristics, I might use a multiple listing service (MLS) data set. An MLS is a private database developed by real estate brokers to provide information about properties for sale. MLS data include several characteristics of agents, such as age, sex, experience, and company.