#loading necessary packages
library(rvest)
library(tidyverse)
library(XML)
library(xml2)
## defining the function to get price data
get_price <- function(html){
html %>%
# The relevant tag
html_nodes('.list-card-price') %>%
html_text() %>%
str_trim()
}
#defining new vectors
HP <- vector(mode = "numeric")
new_scraped_HP <- vector(mode = "numeric")
#running a loop to retrieve data from all the pages
for (i in 1:4){
if (i==1){
house_url <- "https://www.zillow.com/athens-ga/?searchQueryState=%7B%22usersSearchTerm%22%3A%22Athens%2C%20GA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-83.63818270019532%2C%22east%22%3A-83.16439729980469%2C%22south%22%3A33.78092612632548%2C%22north%22%3A34.13517637488345%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A23534%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D"
}
else {
house_url <- paste ("https://www.zillow.com/athens-ga/",i,"_p/?searchQueryState=%7B%22usersSearchTerm%22%3A%22Athens%2C%20GA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-83.63818270019532%2C%22east%22%3A-83.16439729980469%2C%22south%22%3A33.78092612632548%2C%22north%22%3A34.13517637488345%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A23534%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%2C%22pagination%22%3A%7B%22currentPage%22%3A2%7D%7D", sep = " ", collapse = NULL)
}
house_html <- read_html(house_url)
# Store the retrieved data
new_scraped_HP <- get_price(house_html)
# Append all the retrieved data
HP <- c(HP, new_scraped_HP)
}
HPdata <- data.frame(HP)
#removing "$" and ,(comma) from the data
HPdata$price = as.numeric(gsub("[\\$,]", "", HPdata$HP))
dim(HPdata)
## [1] 36 2
## defining the function to get house attributes/details data
get_detail <- function(html){
html %>%
# The relevant tag
html_nodes('.list-card-details') %>%
html_text() %>%
str_trim()
}
#defining new vectors
HD <- vector(mode = "numeric")
new_scraped_HD <- vector(mode = "numeric")
for (i in 1:4){
if (i==1){
house_url <- "https://www.zillow.com/athens-ga/?searchQueryState=%7B%22usersSearchTerm%22%3A%22Athens%2C%20GA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-83.63818270019532%2C%22east%22%3A-83.16439729980469%2C%22south%22%3A33.78092612632548%2C%22north%22%3A34.13517637488345%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A23534%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D"
}
else {
house_url <- paste ("https://www.zillow.com/athens-ga/",i,"_p/?searchQueryState=%7B%22usersSearchTerm%22%3A%22Athens%2C%20GA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-83.63818270019532%2C%22east%22%3A-83.16439729980469%2C%22south%22%3A33.78092612632548%2C%22north%22%3A34.13517637488345%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A23534%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%2C%22pagination%22%3A%7B%22currentPage%22%3A2%7D%7D", sep = " ", collapse = NULL)
}
house_html <- read_html(house_url)
# Store the retrieved data
new_scraped_HD <- get_detail(house_html)
# Append all the retrieved data
HD <- c(HD, new_scraped_HD)
}
final_HD <- data.frame(HD)
dim(final_HD)
## [1] 36 1
##cleaning the dataset for housing details
final_HD_f <- final_HD %>%
mutate(bedrooms = as.integer(str_trim(str_extract(final_HD$HD, "[\\d ]*(?=bds)")))) %>%
mutate(bathrooms = as.integer(str_trim(str_extract(final_HD$HD, "[\\d ]*(?=ba)")))) %>%
mutate(sqft = str_trim(str_extract(final_HD$HD, "[\\d ,]*(?=sqft)"))) %>%
mutate(sqft = as.numeric(str_replace(sqft,",","")))
head(final_HD_f, 4)
## HD bedrooms bathrooms sqft
## 1 3 bds2 ba2,245 sqft- House for sale 3 2 2245
## 2 3 bds2 ba1,400 sqft- House for sale 3 2 1400
## 3 4 bds2 ba1,328 sqft- House for sale 4 2 1328
## 4 4 bds4 ba-- sqft- House for sale 4 4 NA
##Defining function for House Address
get_address <- function(html){
html %>%
# The relevant tag
html_nodes('.list-card-addr') %>%
html_text() %>%
str_trim()
}
#defining new vectors
HA <- vector(mode = "numeric")
new_scraped_HA <- vector(mode = "numeric")
for (i in 1:4){
if (i==1){
house_url <- "https://www.zillow.com/athens-ga/?searchQueryState=%7B%22usersSearchTerm%22%3A%22Athens%2C%20GA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-83.63818270019532%2C%22east%22%3A-83.16439729980469%2C%22south%22%3A33.78092612632548%2C%22north%22%3A34.13517637488345%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A23534%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D"
}
else {
house_url <- paste ("https://www.zillow.com/athens-ga/",i,"_p/?searchQueryState=%7B%22usersSearchTerm%22%3A%22Athens%2C%20GA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-83.63818270019532%2C%22east%22%3A-83.16439729980469%2C%22south%22%3A33.78092612632548%2C%22north%22%3A34.13517637488345%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A23534%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%2C%22pagination%22%3A%7B%22currentPage%22%3A2%7D%7D", sep = " ", collapse = NULL)
}
house_html <- read_html(house_url)
# Store the retrieved data
new_scraped_HA <- get_address(house_html)
# Append all the retrieved data
HA <- c(HA, new_scraped_HA)
}
final_HA <- data.frame(HA)
dim(final_HA)
## [1] 36 1
##cleaning the dataset for zipcode
st_zip <- setNames(data.frame(str_extract_all(final_HA$HA,"[A-Z][A-Z]\\s\\d{4,5}",simplify = T)),"St_zip")
st_zip <- st_zip[st_zip$St_zip != "",]
df1 <- setNames(data.frame(do.call("rbind",strsplit(st_zip,split=' '))),c("State","Zip"))
sort(df1$Zip)
## [1] "30601" "30601" "30601" "30601" "30601" "30601" "30601" "30601" "30601"
## [10] "30601" "30605" "30605" "30605" "30605" "30605" "30605" "30605" "30605"
## [19] "30605" "30605" "30605" "30605" "30606" "30606" "30606" "30606" "30606"
## [28] "30606" "30606" "30606" "30606" "30606" "30606" "30606" "30606" "30607"
#converting the data as a categorical variable
df1$Zip <- factor(df1$Zip, levels = c("30601", "30605", "30606", "30607"))
summary(df1$Zip)
## 30601 30605 30606 30607
## 10 12 13 1
zillow_df <- cbind(HPdata, final_HD_f, final_HA, df1)
dim(zillow_df)
## [1] 36 9
Simple plot for square footage vs. price of the house
ggplot(zillow_df, aes(x=sqft, y=price, size=sqft, color=as.factor(bedrooms), na.rm=TRUE)) +
geom_point()+xlab("Square footage")+ylab("Price")+ggtitle("Relationship between sqaure footage and price of house")
The above graph exhibits that housing prices are linearly related with the square footage of the house and no of bedrooms. Higher the square footage of the house and the more bedrooms, the higher will be the price of that house. However, we can see some outliers in our plot where 4 bedrooms house has a much higher value compared to other houses in the dataset.
hedonic_model <- lm(price~bedrooms+bathrooms+sqft, data=zillow_df)
summary(hedonic_model)
##
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft, data = zillow_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -355931 -158739 -93666 142704 311850
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 379431.75 158802.29 2.389 0.02509 *
## bedrooms -142969.87 50210.05 -2.847 0.00889 **
## bathrooms -102659.27 72326.26 -1.419 0.16864
## sqft 416.76 71.09 5.862 4.78e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 207200 on 24 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.7242, Adjusted R-squared: 0.6898
## F-statistic: 21.01 on 3 and 24 DF, p-value: 6.729e-07
Having an additional bedroom in a home lowers the house price by $142969.87 on average. However, this results a bit unusual because having an additional bedroom in a home is supposed the increase the house price. The negative sign of the parameter estimate can be explained the by outlier value in our dataset (the previous graph for price vs. square footage and no of bedrooms)
Also, having an additional bathroom lowers the value of a home by $102659.27 on average. For additional square footage of the house, the value of home increases by $416.76 on average.
The R-squared value of the model is 0.7242 that is about 72.42% variation of the data can be explained by the model.
Now, adding the zipcode variable to check whether having an locational attribute can increase the goodness-of-fit of the model
hedonic_model_2 <- lm(price~bedrooms+bathrooms+sqft+Zip, data=zillow_df)
summary(hedonic_model_2)
##
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft + Zip, data = zillow_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -452298 -40749 11347 77692 150212
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1362331.70 228950.32 5.950 6.61e-06 ***
## bedrooms -308954.80 49663.78 -6.221 3.59e-06 ***
## bathrooms -382279.31 78224.33 -4.887 7.83e-05 ***
## sqft 821.05 99.01 8.293 4.61e-08 ***
## Zip30605 -736962.14 153313.79 -4.807 9.47e-05 ***
## Zip30606 -534921.25 115565.77 -4.629 0.000145 ***
## Zip30607 -595375.20 185215.82 -3.214 0.004161 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 150200 on 21 degrees of freedom
## (8 observations deleted due to missingness)
## Multiple R-squared: 0.8732, Adjusted R-squared: 0.837
## F-statistic: 24.1 on 6 and 21 DF, p-value: 2.15e-08
The R-squared value of the model with zipcode is 0.8732, adding the locational feature to model increased the goodness-of-fit
One of the important predictors of a hedonic pricing model would be accessibility such as distance from workplace or CBD (central business district), schools, park etc. Moreover, environmental factors such air quality can be another predictor to the model. In our above model we can add another varibale named “Air Quality” which can be retrieved from this website which provides air quality index values (lowest values indicates good air quality) for each zipcode in USA.