Task: Analyse the Athens Real Estate Market

I downloaded the data and saved it as a CSV. I will use the cleaned CSV to do my homework.

 #load packages
#  rm(list = ls(all.names = TRUE))
# necessaryPackages <-c("lattice", "ggplot2", "stargazer", "did", "tree",  "raster", "devtools", "purr", "dplyr", #"tidyverse", "magrittr", "hrbrthemes", "MatchIt", "glmnet", "rvest")
# new.packages <- necessaryPackages[!(necessaryPackages%in% installed.packages()[,"Package"])]
# if(length(new.packages))
#   install.packages(new.packages, repos = "http://cran.us.r-project.org")
# lapply(necessaryPackages, require, character.only = TRUE)
# ## Create function for price
# price <- function(html){
#   html%>%
#     #the relevant tag
#     html_nodes('.list-card-price')%>%
#     html_text()%>%
#     #trimming
#     str_trim()}
## 
# Create function for other details
# Details of the house (bathrooms, bedrooms, square footage, anything else you want)

# details <- function(html){
#   html%>%
#     #the relevant tag
#     html_nodes('.list-card-details')%>%
#     html_text()%>%
#     #trimming
#     str_trim()}

#loop over the four pages

# for (i in 1:4){
#   if (i==1){
#     house_url<-"https://www.zillow.com/athens-ga/3-_beds/1.0-_baths/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Athens%20Ga%22%2C%22mapBounds%22%3A%7B%22west%22%3A-83.80297712207032%2C%22east%22%3A-82.99960187792969%2C%22south%22%3A33.72954602872359%2C%22north%22%3A34.18631216297461%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A23534%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22beds%22%3A%7B%22min%22%3A3%7D%2C%22baths%22%3A%7B%22min%22%3A1%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D"}
#   else{
#     house_url <- paste("https://www.zillow.com/athens-ga/3-_beds/1.0-_baths/',i,'_p/?searchQueryState=%7B%22usersSearchTerm%22%3A%22Athens%20Ga%22%2C%22mapBounds%22%3A%7B%22west%22%3A-83.80297712207032%2C%22east%22%3A-82.99960187792969%2C%22south%22%3A33.72954602872359%2C%22north%22%3A34.18631216297461%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A23534%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22beds%22%3A%7B%22min%22%3A3%7D%2C%22baths%22%3A%7B%22min%22%3A1%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%2C%22pagination%22%3A%7B%22currentPage%22%3A2%7D%7D",sep = "")
#   }
#   zillow_html <- read_html(house_url)
#   price <- c(price, price(zillow_html))
#   details <- c(details, details(zillow_html))
# }
# 
# # Clean
#Vector of price and details
# price <- c()
# details <- c()
# 
# #checking lengths
# length(price)
# 
# 
# library(stringr)
# # Create dataframe with 40 rows and 0 columns
# my_zillow_df <- data.frame(matrix(nrow = 80, ncol = 0))
# 
# # Then clean the file
# #**Do Some Cleaning**
# my_zillow_df2 <- house_df %>%
#   mutate(bedrooms = as.integer(str_trim(str_extract(details, "[\\d ]*(?=bds)")))) %>%
#   mutate(bathrooms = as.integer(str_trim(str_extract(details, "[\\d ]*(?=ba)")))) %>%
#   mutate(sqft = str_trim(str_extract(details, "[\\d ,]*(?=sqft)"))) %>%
#   mutate(sqft = as.numeric(str_replace(sqft,",",""))) %>%
#   mutate(price = as.numeric(str_replace_all(price,"[^0-9]*","")))
# head(my_zillow_df2)
##Set
#dir.create("Data/")
#write.csv(my_zillow_df2, "Data/zillow.csv")

Scrape the results of your housing search

# Read the data sored in my directory
hw9_data <- read.csv("Data/zillow.csv")
head(hw9_data)
##   X bedrooms bathrooms sqft  price
## 1 1        3         2 1743 240000
## 2 2        4         3 1686 650000
## 3 3        6         4 4000 419900
## 4 4        3         3 1429 199900
## 5 5        5         5 3508 537000
## 6 6        3         2   NA  89000

Some plots that are informative.

I draw a 3D plot to see if the price of a home depends on the number of bathroom conditional on having same number of bedroom. But the 3D plot is hard to interpret in this case.

I draw a pairwise plot to check for correlation among all possible variables that I scraped.

library("scatterplot3d")
#colors <- colors[as.numeric(hw9_data$bathrooms)]
scatterplot3d(hw9_data$bedrooms, hw9_data$bathrooms, hw9_data$price, xlab="Bedrooms", ylab = "Bathrooms", zlab="Price")

#pairwise scatterplot 
pairs(hw9_data[, 2:5])

Regression analysis

Run a simple OLS command, as follows (with your variable names, of course)

library(stargazer)
ols_reg <-lm(price~bedrooms+bathrooms+sqft, hw9_data)
ols_reg1 <-lm(log(price)~bedrooms+bathrooms+sqft, hw9_data)
summary(ols_reg)
## 
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft, data = hw9_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -279666  -93275  -16856   83917  321457 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 112271.19   62060.15   1.809   0.0746 .  
## bedrooms    -29746.72   21211.01  -1.402   0.1651    
## bathrooms    33689.79   19089.52   1.765   0.0818 .  
## sqft           138.90      23.07   6.022 6.62e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 138400 on 72 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.6334, Adjusted R-squared:  0.6182 
## F-statistic: 41.47 on 3 and 72 DF,  p-value: 1.12e-15
ols_reg$coefficients
## (Intercept)    bedrooms   bathrooms        sqft 
## 112271.1914 -29746.7230  33689.7884    138.9023
stargazer(ols_reg, ols_reg1,
           title="Regression",
           covariate.labels = c("Bedroom", "Bathroom", "Square Foot"),
           type="text",
           keep.stat=c("n", "rsq"), 
           column_labels=c("OLS", "Log Model"))
## 
## Regression
## =========================================
##                  Dependent variable:     
##              ----------------------------
##                   price       log(price) 
##                    (1)           (2)     
## -----------------------------------------
## Bedroom        -29,746.720      -0.062   
##               (21,211.010)     (0.050)   
##                                          
## Bathroom       33,689.790*      0.082*   
##               (19,089.520)     (0.045)   
##                                          
## Square Foot    138.902***     0.0003***  
##                 (23.066)       (0.0001)  
##                                          
## Constant      112,271.200*    12.061***  
##               (62,060.150)     (0.147)   
##                                          
## -----------------------------------------
## Observations       76             76     
## R2                0.633         0.636    
## =========================================
## Note:         *p<0.1; **p<0.05; ***p<0.01
## 
## Regression
## =============
## OLS Log Model
## -------------

On average, what is the value of having an additional bedroom in a home?

On average, the value of having an additional bedroom would decrease price by 29746 dollars. It is not significant at 5% level of significance. Bathroom increases the price of the house by 33689 dollars.

Please comment on those results, if you are so inclined.

Goodness of fit

The \(R^2\) is very high at more than 63.3% suggesting a very good fit of the model. 63.3 percent of variation is explained by the variables in the model.

It is just a cross section regression. If we had data over years can we could control for year fixed effects, or time trend and could have got better estimates.

Obviously there could be many things that may be be added to the regresion like population, zipcode dummy, ratings of schools around, crime in an area, etc. that could impact the property rates in an area. Other features of the house could be having a garden, yard, or a patio etc. that could impact the price of the house.

I would probably scrape the school district ratings from other websites that have elementary, or other schools in the area.