For the housing search, I chose houses in Athens, GA with 3+ bedrooms and 1+ bathrooms; this search defines my “url”. The results are spread across 4 pages.
#libraries needed
library(rvest)
library(tidyverse)
library(ggplot2)
# make helpful function to extract house prices, house address, and other house information
# for price
# Extract the listing prices from a parsed results page.
#   html: a parsed HTML document (as returned by read_html()).
#   Returns a character vector with one whitespace-trimmed price string per
#   node matching the '.list-card-price' CSS class.
get_price <- function(html) {
  # Select every node carrying the price class.
  price_nodes <- html_nodes(html, ".list-card-price")
  # Pull out the text content of each node.
  price_text <- html_text(price_nodes)
  # Strip leading/trailing whitespace.
  str_trim(price_text)
}
# other house details
# Extract the free-text listing details from a parsed results page.
#   html: a parsed HTML document (as returned by read_html()).
#   Returns a character vector with one whitespace-trimmed details string per
#   node matching the '.list-card-details' CSS class.
get_details <- function(html) {
  # Select every node carrying the details class.
  detail_nodes <- html_nodes(html, ".list-card-details")
  # Pull out the text content of each node.
  detail_text <- html_text(detail_nodes)
  # Strip leading/trailing whitespace.
  str_trim(detail_text)
}
# for address
# Extract the listing addresses from a parsed results page.
#   html: a parsed HTML document (as returned by read_html()).
#   Returns a character vector with one whitespace-trimmed address string per
#   node matching the '.list-card-addr' CSS class.
get_addr <- function(html) {
  # Select every node carrying the address class.
  addr_nodes <- html_nodes(html, ".list-card-addr")
  # Pull out the text content of each node.
  addr_text <- html_text(addr_nodes)
  # Strip leading/trailing whitespace.
  str_trim(addr_text)
}
# Scrape every results page ----
# Page 1 lives at the base path; page i > 1 lives at "<base><i>_p/". All pages
# share the same searchQueryState query string, so it is stored once here
# instead of being duplicated for the first page and the later pages.
n_pages <- 4
base_url <- "https://www.zillow.com/athens-ga/3-_beds/1.0-_baths/"
query_string <- "?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Athens%2C%20GA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-83.63994072874435%2C%22east%22%3A-83.16615532835372%2C%22south%22%3A33.77400648859495%2C%22north%22%3A34.146472131278216%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A23534%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22beds%22%3A%7B%22min%22%3A3%7D%2C%22baths%22%3A%7B%22min%22%3A1%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D"

# Build one URL per page: an empty suffix for page 1, "<i>_p/" afterwards.
page_ids <- seq_len(n_pages)
page_suffix <- ifelse(page_ids == 1, "", paste0(page_ids, "_p/"))
house_urls <- paste0(base_url, page_suffix, query_string)

# Download each page once and extract the three fields from it. Collecting
# the per-page results in a list avoids growing vectors with c() in a loop.
scraped <- lapply(house_urls, function(house_url) {
  house_html <- read_html(house_url)
  list(
    price = get_price(house_html),
    details = get_details(house_html),
    addr = get_addr(house_html)
  )
})

# Flatten the per-page results into one character vector per field,
# keeping page order.
price <- unlist(lapply(scraped, function(page) page[["price"]]))
details <- unlist(lapply(scraped, function(page) page[["details"]]))
addr <- unlist(lapply(scraped, function(page) page[["addr"]]))

# The three fields are scraped independently, so a listing card that lacks one
# of them would silently shift every later value. Fail early if that happens.
if (length(price) != length(details) || length(price) != length(addr)) {
  stop(
    "Scraped fields have different lengths: price = ", length(price),
    ", details = ", length(details),
    ", addr = ", length(addr),
    call. = FALSE
  )
}
# Sanity check: price, addr and details are scraped independently, so confirm
# that all three vectors have the same number of entries (one per listing).
# The "##" lines are the output recorded when this was run.
length(price)
## [1] 40
length(addr)
## [1] 40
length(details)
## [1] 40
# Clean data
library(stringr)
# Create an empty data frame with one row per scraped listing. The row count
# comes from the data rather than a hard-coded 40, so the script keeps working
# when the number of listings changes.
house_df <- data.frame(matrix(nrow = length(details), ncol = 0))
# Parse the scraped strings into numeric columns. `details` and `price` are
# the character vectors scraped above; a field that cannot be found in a
# listing's text becomes NA.
house_df2 <- house_df %>%
  mutate(
    # Digits immediately before "bds", e.g. "3 bds" -> 3.
    bedrooms = as.integer(str_extract(details, "\\d+(?=\\s*bds)")),
    # Digits (optionally with a decimal part) before "ba". Allowing the
    # decimal point keeps "2.5 ba" as 2.5 instead of picking up only the "5";
    # the column is therefore numeric rather than integer.
    bathrooms = as.numeric(str_extract(details, "\\d+(?:\\.\\d+)?(?=\\s*ba)")),
    # Digits and thousands separators before "sqft", e.g. "1,350 sqft" -> 1350.
    # Every comma is removed, not just the first one.
    sqft = as.numeric(
      str_remove_all(str_extract(details, "[\\d,]+(?=\\s*sqft)"), ",")
    ),
    # Drop every non-digit character, e.g. "$225,000" -> 225000.
    price = as.numeric(str_remove_all(price, "[^0-9]"))
  )
# Visual analysis
# Preview the first rows of the cleaned data frame to confirm that the
# bedrooms, bathrooms, sqft and price columns were parsed as expected.
head(house_df2)
## bedrooms bathrooms sqft price
## 1 3 2 1350 225000
## 2 3 3 NA 269900
## 3 3 2 1454 174900
## 4 4 4 2497 345900
## 5 3 2 1832 259900
## 6 4 2 1950 314900
# Scatter plot of price against square footage. Point size also tracks
# square footage, and colour distinguishes the number of bedrooms.
p <- ggplot(
  data = house_df2,
  mapping = aes(
    x = sqft,
    y = price,
    size = sqft,
    color = as.factor(bedrooms)
  )
) +
  geom_point()
p
The figure shows that house prices increase with square footage, i.e. with the size of the house.
# Let's look at box plots of square footage and price to see the outliers.
# par() returns the previous settings; save them and restore them afterwards
# so that later base-graphics plots do not stay in the 1 x 2 layout.
old_par <- par(mfrow = c(1, 2)) # 1 row, 2 columns
boxplot(house_df2$sqft, main = "Sqft") # box plot for sqft
boxplot(house_df2$price, main = "Price") # box plot for price
par(old_par)
There are some outliers in the house prices.
# Fit an ordinary least squares regression of price on the number of bedrooms,
# the number of bathrooms and square footage, then show the estimated
# coefficients followed by the full model summary.
mod <- lm(formula = price ~ bedrooms + bathrooms + sqft, data = house_df2)
coef(mod)
## (Intercept) bedrooms bathrooms sqft
## 309404.2779 -101056.0470 9187.6805 170.9142
summary(mod)
##
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft, data = house_df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -183053 -64241 -15448 54422 297598
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 309404.3 84194.6 3.675 0.000925 ***
## bedrooms -101056.1 31064.9 -3.253 0.002824 **
## bathrooms 9187.7 30888.6 0.297 0.768177
## sqft 170.9 28.0 6.103 1.05e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 103000 on 30 degrees of freedom
## (6 observations deleted due to missingness)
## Multiple R-squared: 0.6916, Adjusted R-squared: 0.6608
## F-statistic: 22.43 on 3 and 30 DF, p-value: 8.189e-08
On average, holding the other variables constant, an additional bedroom decreases the price of a house by roughly 101,056 dollars, and this effect is statistically significant at the 5% significance level. An additional bathroom increases the price by roughly 9,188 dollars, but this effect is not statistically significant. Therefore, the effect of the number of bedrooms is larger in magnitude than the effect of the number of bathrooms. Additionally, each additional square foot increases the price of the house by roughly 171 dollars, and this effect is statistically significant as well.
The R-squared is 0.6916, which shows that 69.16% of the variation in house prices is explained by the model.
We could include locality type (based on crime rates, distance to recreational sites and more), house design, availability of facilities (such as car garage, patio, backyard size and more), and many other factors.
Two further candidate variables are the year the house was built and lumber prices, which depend on the demand and supply of wood in the market.
Yes, we can obtain the year each house was built from Zillow, and lumber prices specifically for the U.S. South from TimberMart-South.