This was the URL used for my analysis. I did not get any errors while scraping the pages, but for some reason I could extract only 9 observations per page (27 in total from three pages).
library(rvest)
library(tidyverse)
library(ggplot2)
library(stringr)
# Functions to extract price and other details
# Pull the listing prices out of a parsed search-results page.
#   html: a parsed HTML document (the script passes the result of read_html()).
# Returns a character vector holding the whitespace-trimmed text of every
# node matching the '.list-card-price' CSS selector.
get_price <- function(html){
  price_nodes <- html_nodes(html, '.list-card-price')
  price_text <- html_text(price_nodes)
  str_trim(price_text)
}
# Pull the free-text listing details out of a parsed search-results page.
#   html: a parsed HTML document (the script passes the result of read_html()).
# Returns a character vector holding the whitespace-trimmed text of every
# node matching the '.list-card-details' CSS selector.
get_details <- function(html){
  detail_nodes <- html_nodes(html, '.list-card-details')
  detail_text <- html_text(detail_nodes)
  str_trim(detail_text)
}
# Looping over all pages
# Every page is requested with the SAME search (3+ beds, 1+ baths, identical
# map bounds and zoom). Only the "<page>_p/" path segment and the pagination
# entry of the query state vary with the page number. Previously pages 2-3
# used a different filter (1+ beds), much wider map bounds, and always asked
# for currentPage 2, so the pooled sample mixed two different searches.
# NOTE(review): the original run yielded only 9 cards per page; presumably the
# remaining listings are rendered client-side and are absent from the static
# HTML -- confirm before relying on the sample size.
n_pages <- 3L

# Build the search URL for one results page (page 1 has no path suffix and an
# empty pagination object; later pages carry "<page>_p/" and currentPage).
zillow_page_url <- function(page) {
  on_first_page <- page == 1
  path_suffix <- if (on_first_page) "" else paste0(page, "_p/")
  pagination <- if (on_first_page) {
    ""
  } else {
    paste0("%22currentPage%22%3A", page)
  }
  paste0(
    "https://www.zillow.com/athens-ga/3-_beds/1.0-_baths/", path_suffix,
    "?searchQueryState=%7B%22usersSearchTerm%22%3A%22Athens%2C%20GA%22%2C",
    "%22mapBounds%22%3A%7B%22west%22%3A-83.63818270019532%2C",
    "%22east%22%3A-83.16439729980469%2C%22south%22%3A33.771794137449%2C",
    "%22north%22%3A34.14426944496286%7D%2C",
    "%22regionSelection%22%3A%5B%7B%22regionId%22%3A23534%2C",
    "%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C",
    "%22filterState%22%3A%7B%22beds%22%3A%7B%22min%22%3A3%7D%2C",
    "%22baths%22%3A%7B%22min%22%3A1%7D%2C",
    "%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C",
    "%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C",
    "%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%2C",
    "%22category%22%3A%22cat1%22%2C",
    "%22pagination%22%3A%7B", pagination, "%7D%7D"
  )
}

# Download each page once, then extract both fields from the same document so
# price[i] and details[i] always come from the same fetch.
pages <- lapply(seq_len(n_pages), function(page) {
  read_html(zillow_page_url(page))
})
price <- unlist(lapply(pages, get_price))
details <- unlist(lapply(pages, get_details))
# Check that lengths agree
# Print both lengths, then fail fast on a mismatch: the cleaning step places
# price[i] and details[i] in the same row, so unequal lengths mean the scrape
# is misaligned.
length(price); length(details)
stopifnot(length(price) == length(details))
## [1] 27
## [1] 27
# Clean the data
# Parse bedrooms, bathrooms and square footage out of the free-text `details`
# strings and strip the formatting from `price`. The result has one row per
# scraped listing with columns bedrooms, bathrooms, sqft and price.
# The frame is sized from the scraped data instead of a hard-coded 27 rows, so
# the script keeps working when the number of listings changes.
zillow_df <- data.frame(matrix(nrow = length(details), ncol = 0))
zillow_df2 <- zillow_df %>%
  mutate(
    bedrooms = as.integer(str_trim(str_extract(details, "[\\d ]*(?=bds)"))),
    # Accept a decimal part and keep it numeric: the previous pattern
    # "[\\d ]*(?=ba)" could not cross the dot, so "2.5 ba" was read as 5.
    bathrooms = as.numeric(
      str_extract(details, "\\d+(?:\\.\\d+)?(?=\\s*ba)")
    ),
    # Listings without a square footage yield "" here and become NA below.
    sqft = str_trim(str_extract(details, "[\\d ,]*(?=sqft)")),
    # Remove every thousands separator, not only the first one.
    sqft = as.numeric(str_replace_all(sqft, ",", "")),
    # `price` on the right-hand side is the scraped character vector; "+"
    # avoids a pattern that can match the empty string.
    price = as.numeric(str_replace_all(price, "[^0-9]+", ""))
  )
head(zillow_df2)
## bedrooms bathrooms sqft price
## 1 3 2 2245 324900
## 2 3 2 1400 225000
## 3 3 2 2766 415000
## 4 4 2 1328 240000
## 5 4 4 NA 1149000
## 6 4 3 2744 399000
# Scatter plot of price against square footage; point size also tracks sqft
# and colour distinguishes the number of bedrooms (treated as a category).
g <- ggplot(data = zillow_df2) +
  geom_point(
    mapping = aes(
      x = sqft,
      y = price,
      size = sqft,
      color = as.factor(bedrooms)
    )
  )
g
Run a simple OLS regression of price on square footage, bedrooms, and bathrooms
# Fit price as a linear function of square footage, bedrooms and bathrooms,
# then show the estimated coefficients followed by the full regression table.
model <- lm(
  formula = price ~ sqft + bedrooms + bathrooms,
  data = zillow_df2
)
coef(model)
## (Intercept) sqft bedrooms bathrooms
## 142195.9344 293.6045 -129155.2283 51376.6058
summary(model)
##
## Call:
## lm(formula = price ~ sqft + bedrooms + bathrooms, data = zillow_df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -254593 -180133 -36483 124939 404053
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 142195.93 178894.86 0.795 0.436028
## sqft 293.60 74.35 3.949 0.000793 ***
## bedrooms -129155.23 58028.98 -2.226 0.037697 *
## bathrooms 51376.61 84707.51 0.607 0.550994
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 205100 on 20 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.6803, Adjusted R-squared: 0.6323
## F-statistic: 14.18 on 3 and 20 DF, p-value: 3.478e-05
On average, holding square footage and bathrooms constant, an additional bedroom is associated with a price that is $129,155 lower, and this result is significant at the 5% level (p = 0.038).
On average, having an additional bathroom increases the price by $51,377, but this result is not significant (p = 0.55).
Goodness of fit - I get an R squared of 68%.
Based on the concept of hedonic pricing, I would include several external factors that would affect the price of the house like crime rate in the neighborhood, accessibility to schools, level of water and air pollution.
Using the address data from Zillow, I could geocode each listing and obtain its distance from the nearest school, the local air pollution level, and its distance from the highway, which would improve my model.