Task: Analyse the Athens Real Estate Market

#loading necessary packages
library(rvest)
library(tidyverse)
library(XML)
library(xml2)

Do some cleaning

Cleaning the dataset for housing attributes and creating new variables

##cleaning the dataset for housing details 

final_HD_f <- final_HD %>%
  mutate(bedrooms = as.integer(str_trim(str_extract(final_HD$HD, "[\\d ]*(?=bds)")))) %>%
  mutate(bathrooms = as.integer(str_trim(str_extract(final_HD$HD, "[\\d ]*(?=ba)")))) %>%
  mutate(sqft = str_trim(str_extract(final_HD$HD, "[\\d ,]*(?=sqft)"))) %>%
  mutate(sqft = as.numeric(str_replace(sqft,",",""))) 
head(final_HD_f, 4)
##                                    HD bedrooms bathrooms sqft
## 1 3 bds2 ba2,245 sqft- House for sale        3         2 2245
## 2 3 bds2 ba1,400 sqft- House for sale        3         2 1400
## 3 4 bds2 ba1,328 sqft- House for sale        4         2 1328
## 4    4 bds4 ba-- sqft- House for sale        4         4   NA

Getting data for housing address

##Defining function for House Address

get_address <- function(html){
  html %>%
    # The relevant tag
    html_nodes('.list-card-addr') %>%
    html_text() %>%
    str_trim()
}
#defining new vectors 
HA <- vector(mode = "numeric")
new_scraped_HA <- vector(mode = "numeric")

Retrieving data from all the pages

for (i in 1:4){
  if (i==1){
    house_url <- "https://www.zillow.com/athens-ga/?searchQueryState=%7B%22usersSearchTerm%22%3A%22Athens%2C%20GA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-83.63818270019532%2C%22east%22%3A-83.16439729980469%2C%22south%22%3A33.78092612632548%2C%22north%22%3A34.13517637488345%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A23534%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D"
  }
  else {
    house_url <- paste ("https://www.zillow.com/athens-ga/",i,"_p/?searchQueryState=%7B%22usersSearchTerm%22%3A%22Athens%2C%20GA%22%2C%22mapBounds%22%3A%7B%22west%22%3A-83.63818270019532%2C%22east%22%3A-83.16439729980469%2C%22south%22%3A33.78092612632548%2C%22north%22%3A34.13517637488345%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A23534%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%2C%22pagination%22%3A%7B%22currentPage%22%3A2%7D%7D", sep = " ", collapse = NULL)
  }
  house_html <- read_html(house_url)
  
  # Store the retrieved data
  new_scraped_HA <- get_address(house_html)
  
  # Append all the retrieved data
  HA <- c(HA, new_scraped_HA)
}

final_HA <- data.frame(HA)

dim(final_HA)
## [1] 36  1

Do some cleaning

Extracting the zipcode of every house from the address variable to use as a locational attribute

##cleaning the dataset for zipcode 
st_zip <- setNames(data.frame(str_extract_all(final_HA$HA,"[A-Z][A-Z]\\s\\d{4,5}",simplify = T)),"St_zip")
st_zip <- st_zip[st_zip$St_zip != "",]
df1 <- setNames(data.frame(do.call("rbind",strsplit(st_zip,split=' '))),c("State","Zip"))
sort(df1$Zip)
##  [1] "30601" "30601" "30601" "30601" "30601" "30601" "30601" "30601" "30601"
## [10] "30601" "30605" "30605" "30605" "30605" "30605" "30605" "30605" "30605"
## [19] "30605" "30605" "30605" "30605" "30606" "30606" "30606" "30606" "30606"
## [28] "30606" "30606" "30606" "30606" "30606" "30606" "30606" "30606" "30607"
#converting the data as a categorical variable
df1$Zip <- factor(df1$Zip, levels = c("30601", "30605", "30606", "30607"))

summary(df1$Zip)
## 30601 30605 30606 30607 
##    10    12    13     1

Combining the data set for housing prices, details, address and zipcode

zillow_df <- cbind(HPdata, final_HD_f, final_HA, df1)
dim(zillow_df)
## [1] 36  9

Visualization

Simple plot for square footage vs. price of the house

ggplot(zillow_df, aes(x=sqft, y=price, size=sqft, color=as.factor(bedrooms), na.rm=TRUE)) + 
  geom_point()+xlab("Square footage")+ylab("Price")+ggtitle("Relationship between sqaure footage and price of house")

The above graph exhibits that housing prices are linearly related with the square footage of the house and no of bedrooms. Higher the square footage of the house and the more bedrooms, the higher will be the price of that house. However, we can see some outliers in our plot where 4 bedrooms house has a much higher value compared to other houses in the dataset.

Regression analysis

hedonic_model <- lm(price~bedrooms+bathrooms+sqft, data=zillow_df)
summary(hedonic_model)
## 
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft, data = zillow_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -355931 -158739  -93666  142704  311850 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  379431.75  158802.29   2.389  0.02509 *  
## bedrooms    -142969.87   50210.05  -2.847  0.00889 ** 
## bathrooms   -102659.27   72326.26  -1.419  0.16864    
## sqft            416.76      71.09   5.862 4.78e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 207200 on 24 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.7242, Adjusted R-squared:  0.6898 
## F-statistic: 21.01 on 3 and 24 DF,  p-value: 6.729e-07

Having an additional bedroom in a home lowers the house price by $142969.87 on average. However, this results a bit unusual because having an additional bedroom in a home is supposed the increase the house price. The negative sign of the parameter estimate can be explained the by outlier value in our dataset (the previous graph for price vs. square footage and no of bedrooms)

Also, having an additional bathroom lowers the value of a home by $102659.27 on average. For additional square footage of the house, the value of home increases by $416.76 on average.

Goodness of fit

The R-squared value of the model is 0.7242 that is about 72.42% variation of the data can be explained by the model.

Regression model with zipcode

Now, adding the zipcode variable to check whether having an locational attribute can increase the goodness-of-fit of the model

hedonic_model_2 <- lm(price~bedrooms+bathrooms+sqft+Zip, data=zillow_df)
summary(hedonic_model_2)
## 
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft + Zip, data = zillow_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -452298  -40749   11347   77692  150212 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1362331.70  228950.32   5.950 6.61e-06 ***
## bedrooms    -308954.80   49663.78  -6.221 3.59e-06 ***
## bathrooms   -382279.31   78224.33  -4.887 7.83e-05 ***
## sqft            821.05      99.01   8.293 4.61e-08 ***
## Zip30605    -736962.14  153313.79  -4.807 9.47e-05 ***
## Zip30606    -534921.25  115565.77  -4.629 0.000145 ***
## Zip30607    -595375.20  185215.82  -3.214 0.004161 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 150200 on 21 degrees of freedom
##   (8 observations deleted due to missingness)
## Multiple R-squared:  0.8732, Adjusted R-squared:  0.837 
## F-statistic:  24.1 on 6 and 21 DF,  p-value: 2.15e-08

The R-squared value of the model with zipcode is 0.8732, adding the locational feature to model increased the goodness-of-fit

One of the important predictors of a hedonic pricing model would be accessibility such as distance from workplace or CBD (central business district), schools, park etc. Moreover, environmental factors such air quality can be another predictor to the model. In our above model we can add another varibale named “Air Quality” which can be retrieved from this website which provides air quality index values (lowest values indicates good air quality) for each zipcode in USA.