# NOTE: Done with the help of Hari.
library(rvest)
library(tidyverse)
library(ggplot2)
library(stringr)
# Pull the price text out of a parsed results page.
#
# html: a parsed HTML document (as returned by read_html()).
# Returns a character vector with one whitespace-trimmed entry per node
# matching the ".list-card-price" CSS selector.
price_ph <- function(html) {
  price_nodes <- html_nodes(html, ".list-card-price")
  price_text <- html_text(price_nodes)
  str_trim(price_text)
}
# Pull the attribute ("details") text out of a parsed results page.
#
# html: a parsed HTML document (as returned by read_html()).
# Returns a character vector with one whitespace-trimmed entry per node
# matching the ".list-card-details" CSS selector.
attrib_ph <- function(html) {
  detail_nodes <- html_nodes(html, ".list-card-details")
  detail_text <- html_text(detail_nodes)
  str_trim(detail_text)
}
# Scrape the search-result pages ----
# Three requests are made, in this order:
#   1. the Bozeman, MT search (3+ beds, 2+ baths, price <= 1,000,000);
#   2. the map-bounds "for sale" search (1+ beds, 1+ baths, price <= 1,500,000);
#   3. the same URL as request 2.
# NOTE(review): requests 2 and 3 use an identical URL whose `pagination`
# field is empty, so they presumably return the same listings twice. The
# duplication is kept to preserve the original behaviour -- confirm whether
# distinct result pages were intended.
first_page_url <- "https://www.zillow.com/bozeman-mt/3-_beds/2.0-_baths/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Bozeman%2C%20MT%22%2C%22mapBounds%22%3A%7B%22west%22%3A-112.06471723828125%2C%22east%22%3A-109.90590376171875%2C%22south%22%3A45.1519196193617%2C%22north%22%3A46.33753680187748%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A44281%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22beds%22%3A%7B%22min%22%3A3%7D%2C%22baths%22%3A%7B%22min%22%3A2%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%2C%22price%22%3A%7B%22max%22%3A1000000%7D%2C%22mp%22%3A%7B%22max%22%3A4055%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A9%7D"
later_page_url <- "https://www.zillow.com/homes/for_sale/1-_beds/1.0-_baths/?searchQueryState=%7B%22mapBounds%22%3A%7B%22west%22%3A-112.06471723828125%2C%22east%22%3A-109.90590376171875%2C%22south%22%3A45.1519196193617%2C%22north%22%3A46.33753680187748%7D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22price%22%3A%7B%22min%22%3A0%2C%22max%22%3A1500000%7D%2C%22mp%22%3A%7B%22min%22%3A0%2C%22max%22%3A6082%7D%2C%22beds%22%3A%7B%22min%22%3A1%7D%2C%22baths%22%3A%7B%22min%22%3A1%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A9%2C%22pagination%22%3A%7B%7D%7D"
page_urls <- c(first_page_url, later_page_url, later_page_url)

# Download and parse each page once, then collect the card texts page by
# page instead of growing the result vectors inside a loop.
pages <- lapply(page_urls, read_html)
price <- unlist(lapply(pages, price_ph))
details <- unlist(lapply(pages, attrib_ph))
# Build the listings table ----
# One row per scraped listing card. The row count is taken from the scraped
# data rather than hard-coded, so the script keeps working when the pages
# return a different number of cards.
n_listings <- length(details)
if (length(price) != n_listings) {
  stop(
    "Scraped ", length(price), " prices but ", n_listings,
    " detail strings; cannot align listings.",
    call. = FALSE
  )
}
zillow_df <- data.frame(matrix(nrow = n_listings, ncol = 0))

# Each attribute is the run of digits (and spaces/commas) immediately before
# its label in the details text; values that cannot be parsed become NA.
zillow_df2 <- zillow_df %>%
  mutate(
    # `bds?` also accepts a singular "bd" label (presumably used on
    # one-bedroom cards -- TODO confirm against the live page text).
    bedrooms = as.integer(str_trim(str_extract(details, "[\\d ]*(?=bds?)"))),
    bathrooms = as.integer(str_trim(str_extract(details, "[\\d ]*(?=ba)"))),
    # Strip every thousands separator, not just the first one.
    sqft = as.numeric(
      str_replace_all(str_trim(str_extract(details, "[\\d ,]*(?=sqft)")), ",", "")
    ),
    # Keep only the digits of the price text (drops "$", ",", "+", ...).
    price = as.numeric(str_replace_all(price, "[^0-9]+", ""))
  )
# Scatter plot of price against living area ----
# Point size follows sqft and point colour follows the bedroom count
# (treated as a categorical variable).
point_mapping <- aes(
  x = sqft,
  y = price,
  size = sqft,
  color = as.factor(bedrooms)
)
pl <- ggplot(data = zillow_df2, mapping = point_mapping) +
  geom_point()
pl
# Linear model: price explained by sqft, bedrooms and bathrooms ----
reg <- lm(formula = price ~ sqft + bedrooms + bathrooms, data = zillow_df2)
# Estimated coefficients (the "##" lines appear to be output recorded from
# an earlier run).
coef(reg)
## (Intercept) sqft bedrooms bathrooms
## 121600.03565 30.31374 115864.42458 30280.67622
# Full model summary: residuals, coefficient table and fit statistics.
summary(reg)
##
## Call:
## lm(formula = price ~ sqft + bedrooms + bathrooms, data = zillow_df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -103083 -50123 -14497 57470 104110
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 121600.04 88797.39 1.369 0.18469
## sqft 30.31 11.49 2.639 0.01498 *
## bedrooms 115864.42 32578.18 3.557 0.00177 **
## bathrooms 30280.68 34590.66 0.875 0.39081
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70420 on 22 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.6485, Adjusted R-squared: 0.6006
## F-statistic: 13.53 on 3 and 22 DF, p-value: 3.22e-05