Reference and disclaimer: This homework is attempting to webscrap the housing data from the website “Zillow” (http://www.zillow.com/).

1. Scrapped Data

content

First of all, from that site, we targeted all houses on sale within the boundary of Athens, Georgia, US which has more than 1 bedroom. Second, from those houses, we scrapped the data on price, the number of bedrooms, the number of bathrooms, sqft, and address. Third, we need to collect the data from 10 web-pages. So, by using the loop, we simplified the code. The R code is as follows.(Here, through the next tab, you can see the definition of the variable “athens” )

## Scrapping function
athens <- c(athens1, athens2, athens3, athens4, athens5, athens6, athens7, athens8, athens9, athens10)

get_price <- function(html){
  html %>%
    # The relevant tag
    html_nodes('.list-card-price') %>%
    html_text() %>%
    str_trim()
}

get_details <- function(html){
  html %>%
    # The relevant tag
    html_nodes('.list-card-details') %>%
    html_text() %>%
    str_trim()
}

get_addresses <- function(html){
  html %>%
    # The relevant tag
    html_nodes('.list-card-addr') %>%
    html_text() %>%
    str_trim()
}

## collecting data by looping
addresses=c()
p=c()
details=c()

for (i in 1:10) {
  d <- read_html(athens[i])
  d_scrapped <- get_details(d)
  p_scrapped <- get_price(d)
  a_scrapped <- get_addresses(d)
  p <- c(p, p_scrapped)
  details <- c(details, d_scrapped)
  addresses <- c(addresses, a_scrapped)
}

The R code for “athens”

athens1 <- "https://www.zillow.com/athens-ga/houses/?searchQueryState={%22pagination%22:{},%22usersSearchTerm%22:%22athens%22,%22mapBounds%22:{%22west%22:-84.12844709277344,%22east%22:-82.67275861621094,%22south%22:33.41603485925962,%22north%22:34.498134675837406},%22mapZoom%22:9,%22regionSelection%22:[{%22regionId%22:23534,%22regionType%22:6}],%22isMapVisible%22:true,%22filterState%22:{%22sortSelection%22:{%22value%22:%22globalrelevanceex%22},%22isManufactured%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}"
athens2 <- "https://www.zillow.com/athens-ga/houses/2_p/?searchQueryState={%22pagination%22:{%22currentPage%22:2},%22usersSearchTerm%22:%22athens%22,%22mapBounds%22:{%22west%22:-84.12844709277344,%22east%22:-82.67275861621094,%22south%22:33.419851898892745,%22north%22:34.494365709232525},%22mapZoom%22:9,%22regionSelection%22:[{%22regionId%22:23534,%22regionType%22:6}],%22isMapVisible%22:true,%22filterState%22:{%22sortSelection%22:{%22value%22:%22globalrelevanceex%22},%22isManufactured%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}"
athens3 <- "https://www.zillow.com/athens-ga/houses/3_p/?searchQueryState={%22pagination%22:{%22currentPage%22:3},%22usersSearchTerm%22:%22athens%22,%22mapBounds%22:{%22west%22:-84.12844709277344,%22east%22:-82.67275861621094,%22south%22:33.419851898892745,%22north%22:34.494365709232525},%22mapZoom%22:9,%22regionSelection%22:[{%22regionId%22:23534,%22regionType%22:6}],%22isMapVisible%22:true,%22filterState%22:{%22sortSelection%22:{%22value%22:%22globalrelevanceex%22},%22isManufactured%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}"
athens4 <- "https://www.zillow.com/athens-ga/houses/4_p/?searchQueryState={%22pagination%22:{%22currentPage%22:4},%22usersSearchTerm%22:%22athens%22,%22mapBounds%22:{%22west%22:-84.12844709277344,%22east%22:-82.67275861621094,%22south%22:33.419851898892745,%22north%22:34.494365709232525},%22mapZoom%22:9,%22regionSelection%22:[{%22regionId%22:23534,%22regionType%22:6}],%22isMapVisible%22:true,%22filterState%22:{%22sortSelection%22:{%22value%22:%22globalrelevanceex%22},%22isManufactured%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}"
athens5 <- "https://www.zillow.com/athens-ga/houses/5_p/?searchQueryState={%22pagination%22:{%22currentPage%22:5},%22usersSearchTerm%22:%22athens%22,%22mapBounds%22:{%22west%22:-84.12844709277344,%22east%22:-82.67275861621094,%22south%22:33.419851898892745,%22north%22:34.494365709232525},%22mapZoom%22:9,%22regionSelection%22:[{%22regionId%22:23534,%22regionType%22:6}],%22isMapVisible%22:true,%22filterState%22:{%22sortSelection%22:{%22value%22:%22globalrelevanceex%22},%22isManufactured%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}"
athens6 <- "https://www.zillow.com/athens-ga/houses/6_p/?searchQueryState={%22pagination%22:{%22currentPage%22:6},%22usersSearchTerm%22:%22athens%22,%22mapBounds%22:{%22west%22:-84.12844709277344,%22east%22:-82.67275861621094,%22south%22:33.419851898892745,%22north%22:34.494365709232525},%22mapZoom%22:9,%22regionSelection%22:[{%22regionId%22:23534,%22regionType%22:6}],%22isMapVisible%22:true,%22filterState%22:{%22sortSelection%22:{%22value%22:%22globalrelevanceex%22},%22isManufactured%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}"
athens7 <- "https://www.zillow.com/athens-ga/houses/7_p/?searchQueryState={%22pagination%22:{%22currentPage%22:7},%22usersSearchTerm%22:%22athens%22,%22mapBounds%22:{%22west%22:-84.12844709277344,%22east%22:-82.67275861621094,%22south%22:33.419851898892745,%22north%22:34.494365709232525},%22mapZoom%22:9,%22regionSelection%22:[{%22regionId%22:23534,%22regionType%22:6}],%22isMapVisible%22:true,%22filterState%22:{%22sortSelection%22:{%22value%22:%22globalrelevanceex%22},%22isManufactured%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}"
athens8 <- "https://www.zillow.com/athens-ga/houses/8_p/?searchQueryState={%22pagination%22:{%22currentPage%22:8},%22usersSearchTerm%22:%22athens%22,%22mapBounds%22:{%22west%22:-84.12844709277344,%22east%22:-82.67275861621094,%22south%22:33.419851898892745,%22north%22:34.494365709232525},%22mapZoom%22:9,%22regionSelection%22:[{%22regionId%22:23534,%22regionType%22:6}],%22isMapVisible%22:true,%22filterState%22:{%22sortSelection%22:{%22value%22:%22globalrelevanceex%22},%22isManufactured%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}"
athens9 <- "https://www.zillow.com/athens-ga/houses/9_p/?searchQueryState={%22pagination%22:{%22currentPage%22:9},%22usersSearchTerm%22:%22athens%22,%22mapBounds%22:{%22west%22:-84.12844709277344,%22east%22:-82.67275861621094,%22south%22:33.419851898892745,%22north%22:34.494365709232525},%22mapZoom%22:9,%22regionSelection%22:[{%22regionId%22:23534,%22regionType%22:6}],%22isMapVisible%22:true,%22filterState%22:{%22sortSelection%22:{%22value%22:%22globalrelevanceex%22},%22isManufactured%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}"
athens10 <- "https://www.zillow.com/athens-ga/houses/10_p/?searchQueryState={%22pagination%22:{%22currentPage%22:10},%22usersSearchTerm%22:%22athens%22,%22mapBounds%22:{%22west%22:-84.12844709277344,%22east%22:-82.67275861621094,%22south%22:33.419851898892745,%22north%22:34.494365709232525},%22mapZoom%22:9,%22regionSelection%22:[{%22regionId%22:23534,%22regionType%22:6}],%22isMapVisible%22:true,%22filterState%22:{%22sortSelection%22:{%22value%22:%22globalrelevanceex%22},%22isManufactured%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}"

2. Data cleaning

Scrapped data has some kinds of formats like nummeric values, english charcters, special characters such as comma. So, each variable should be cleaned in the form that we want. The R code for data cleaning is as follows:

housing_data <- tibble(p,details,addresses)
housing_data <- housing_data %>%
  mutate(bedrooms = as.integer(str_trim(str_extract(details, "[\\d ]*(?=bds)")))) %>%
  mutate(bathrooms = as.integer(str_trim(str_extract(details, "[\\d ]*(?=ba)")))) %>%
  mutate(sqft = str_trim(str_extract(details, "[\\d ,]*(?=sqft)"))) %>%
  mutate(sqft = as.numeric(str_replace(sqft,",",""))) %>%
  mutate(price = as.numeric(str_replace_all(p,"[^0-9]*","")))

3. Visualization

Plot

Our code

par(mfrow=c(2,3)) 
hist(housing_data$sqft, main="freq. of sqft", xlab="sqft")
hist(housing_data$bedrooms, main="freq. of bedrooms", xlab="bedrooms")
hist(housing_data$bathrooms, main="freq. of bathrooms", xlab="bathrooms")
plot(housing_data$sqft, housing_data$price, xlab="sqft", ylab="price")
plot(housing_data$bedrooms, housing_data$price, xlab="bedrooms", ylab="price")
plot(housing_data$bathrooms, housing_data$price, xlab="bathrooms", ylab="price")

4. Regression

We conducted the linear regression analysis. The dependent variable is “price”, and the independent variables are “bedrooms”, “bathroomes”, and “sqft”, including the constant. Based on the estimation results, on average, the price decreases by $45,966 as an additional bedroom increases in a home, and the price increases by $42,133 as an additional bathroom increases in a home. In particular, the sign of “bedrooms” are inconsistent with our common senses. Also, as for the bathroom, the coefficient is overestimated. According to this article, if you are converting existing space to a half-bath, you may end up spending **$5,000 or less*". If you have to add on to your house, figure on spending at least $25,000** for a bathroom. This is because of omitted variable bias, where omitted variables are correlated with the independt variables as well as the dependent variable. This causes coefficients to be biased and inconsistent. Also, the estimation results is as follows:

Results

## (Intercept)        sqft    bedrooms   bathrooms 
##  36604.0788    155.7845 -52748.7755  50586.3706

Our code

model <- lm(price ~ sqft + bedrooms + bathrooms, housing_data)
model$coefficients

5. Goodnees of fit

Contents

\(R^{2}\) of our model is approximately 0.67. I would call ok-but-not great. Therefore, based on the omitted variables and the level of \(R^{2}\), we have a spare room to add more independent or control variables. To increase the fit, we can consider not only the internal and external physical characteristics of the house but also environmental amenities. As for the physical characteristics, The Moodys/REAL commercial property index (CPPI), Energy Star or LEED certified, Parking Ration, Stories, Year Renovated, Year Built, security system built-in, distance from the police station, swimming pool or fitness center in the complex and so on. As for the concept of hedonic pricing, educatioanl environment, specifically distance from UGA, will influence the housing price. Therefore, this data can be collected through the google map with the address of each house.

Our results and codes

summary(model)
## 
## Call:
## lm(formula = price ~ sqft + bedrooms + bathrooms, data = housing_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -507737  -79145  -18367   41322  836037 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36604.1    29800.5   1.228     0.22    
## sqft           155.8       10.9  14.289  < 2e-16 ***
## bedrooms    -52748.8    11578.8  -4.556 7.31e-06 ***
## bathrooms    50586.4    10621.8   4.762 2.85e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 146600 on 337 degrees of freedom
##   (46 observations deleted due to missingness)
## Multiple R-squared:  0.7093, Adjusted R-squared:  0.7067 
## F-statistic: 274.1 on 3 and 337 DF,  p-value: < 2.2e-16

6-1. Bonus tasks

Contents

First of all, based the total number of results scrapped on the main page, we calculated the total number of pages using the information 40 results per page. Second, using the loop and paste function, we generated the variables of page’s web addresses, and created the vector of page’s web addresses. The remaining processes are the same as the above.

our code

### main page
athens1 <- "https://www.zillow.com/athens-ga/houses/?searchQueryState={%22pagination%22:{},%22usersSearchTerm%22:%22athens%22,%22mapBounds%22:{%22west%22:-84.12844709277344,%22east%22:-82.67275861621094,%22south%22:33.41603485925962,%22north%22:34.498134675837406},%22mapZoom%22:9,%22regionSelection%22:[{%22regionId%22:23534,%22regionType%22:6}],%22isMapVisible%22:true,%22filterState%22:{%22sortSelection%22:{%22value%22:%22globalrelevanceex%22},%22isManufactured%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}"

### The total number of results and the number of pages
get_results <- function(html){
  html %>%
    # The relevant tag
    html_nodes('.result-count') %>%
    html_text() %>%
    str_trim()
}

aa <- read_html(athens1)
n_results <- get_results(aa)
n_results <- as.numeric(str_replace_all(n_results,"[^0-9]*",""))
n_page <- ceiling(n_results/40)

### generate the web page variable

athens_before <- "https://www.zillow.com/athens-ga/houses/"
athens_after <- "},%22usersSearchTerm%22:%22athens%22,%22mapBounds%22:{%22west%22:-84.12844709277344,%22east%22:-82.67275861621094,%22south%22:33.419851898892745,%22north%22:34.494365709232525},%22mapZoom%22:9,%22regionSelection%22:[{%22regionId%22:23534,%22regionType%22:6}],%22isMapVisible%22:true,%22filterState%22:{%22sortSelection%22:{%22value%22:%22globalrelevanceex%22},%22isManufactured%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}"

athens=c(athens1)
for (i in 2:n_page) {
  athens_middle <- paste(i,"_p/?searchQueryState={%22pagination%22:{%22currentPage%22:",i,sep="")
  athens_pages <- paste(athens_before,athens_middle,athens_after,sep="") 
  athens <- c(athens, athens_pages)
}

## Scrapping function

get_price <- function(html){
  html %>%
    # The relevant tag
    html_nodes('.list-card-price') %>%
    html_text() %>%
    str_trim()
}

get_details <- function(html){
  html %>%
    # The relevant tag
    html_nodes('.list-card-details') %>%
    html_text() %>%
    str_trim()
}

get_addresses <- function(html){
  html %>%
    # The relevant tag
    html_nodes('.list-card-addr') %>%
    html_text() %>%
    str_trim()
}

## collecting data by looping
addresses=c()
p=c()
details=c()

for (i in 1:n_page) {
  d <- read_html(athens[i])
  d_scrapped <- get_details(d)
  p_scrapped <- get_price(d)
  a_scrapped <- get_addresses(d)
  p <- c(p, p_scrapped)
  details <- c(details, d_scrapped)
  addresses <- c(addresses, a_scrapped)
}

c(length(p), length(details), length(addresses))
## [1] 347 347 347

6-2. The possible hedonic regression from the page

Also, we have the information on the address of each house. Basically, we can find the accessibility to spaces for recreational activities like the parks, lakes, rivers, and so on. If we extend our research boundary to wider areas, we can match the addresses with pollution levels for airs and waters, unpleasant facilities like landfill and waste incineration plant.