I downloaded the data and saved it as a CSV. I will use the cleaned CSV to do my homework.
#load packages
# rm(list = ls(all.names = TRUE))
# necessaryPackages <-c("lattice", "ggplot2", "stargazer", "did", "tree", "raster", "devtools", "purr", "dplyr", #"tidyverse", "magrittr", "hrbrthemes", "MatchIt", "glmnet", "rvest")
# new.packages <- necessaryPackages[!(necessaryPackages%in% installed.packages()[,"Package"])]
# if(length(new.packages))
# install.packages(new.packages, repos = "http://cran.us.r-project.org")
# lapply(necessaryPackages, require, character.only = TRUE)
# ## Create function for price
# price <- function(html){
# html%>%
# #the relevant tag
# html_nodes('.list-card-price')%>%
# html_text()%>%
# #trimming
# str_trim()}
##
# Create function for other details
# Details of the house (bathrooms, bedrooms, square footage, anything else you want)
# details <- function(html){
# html%>%
# #the relevant tag
# html_nodes('.list-card-details')%>%
# html_text()%>%
# #trimming
# str_trim()}
#loop over the four pages
# for (i in 1:4){
# if (i==1){
# house_url<-"https://www.zillow.com/athens-ga/3-_beds/1.0-_baths/?searchQueryState=%7B%22pagination%22%3A%7B%7D%2C%22usersSearchTerm%22%3A%22Athens%20Ga%22%2C%22mapBounds%22%3A%7B%22west%22%3A-83.80297712207032%2C%22east%22%3A-82.99960187792969%2C%22south%22%3A33.72954602872359%2C%22north%22%3A34.18631216297461%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A23534%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22beds%22%3A%7B%22min%22%3A3%7D%2C%22baths%22%3A%7B%22min%22%3A1%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%7D"}
# else{
# house_url <- paste("https://www.zillow.com/athens-ga/3-_beds/1.0-_baths/',i,'_p/?searchQueryState=%7B%22usersSearchTerm%22%3A%22Athens%20Ga%22%2C%22mapBounds%22%3A%7B%22west%22%3A-83.80297712207032%2C%22east%22%3A-82.99960187792969%2C%22south%22%3A33.72954602872359%2C%22north%22%3A34.18631216297461%7D%2C%22regionSelection%22%3A%5B%7B%22regionId%22%3A23534%2C%22regionType%22%3A6%7D%5D%2C%22isMapVisible%22%3Atrue%2C%22filterState%22%3A%7B%22beds%22%3A%7B%22min%22%3A3%7D%2C%22baths%22%3A%7B%22min%22%3A1%7D%2C%22sort%22%3A%7B%22value%22%3A%22globalrelevanceex%22%7D%2C%22ah%22%3A%7B%22value%22%3Atrue%7D%7D%2C%22isListVisible%22%3Atrue%2C%22mapZoom%22%3A11%2C%22pagination%22%3A%7B%22currentPage%22%3A2%7D%7D",sep = "")
# }
# zillow_html <- read_html(house_url)
# price <- c(price, price(zillow_html))
# details <- c(details, details(zillow_html))
# }
#
# # Clean
#Vector of price and details
# price <- c()
# details <- c()
#
# #checking lengths
# length(price)
#
#
# library(stringr)
# # Create dataframe with 40 rows and 0 columns
# my_zillow_df <- data.frame(matrix(nrow = 80, ncol = 0))
#
# # Then clean the file
# #**Do Some Cleaning**
# my_zillow_df2 <- house_df %>%
# mutate(bedrooms = as.integer(str_trim(str_extract(details, "[\\d ]*(?=bds)")))) %>%
# mutate(bathrooms = as.integer(str_trim(str_extract(details, "[\\d ]*(?=ba)")))) %>%
# mutate(sqft = str_trim(str_extract(details, "[\\d ,]*(?=sqft)"))) %>%
# mutate(sqft = as.numeric(str_replace(sqft,",",""))) %>%
# mutate(price = as.numeric(str_replace_all(price,"[^0-9]*","")))
# head(my_zillow_df2)
##Set
#dir.create("Data/")
#write.csv(my_zillow_df2, "Data/zillow.csv")
Scrape the results of your housing search
# Read the data sored in my directory
hw9_data <- read.csv("Data/zillow.csv")
head(hw9_data)
## X bedrooms bathrooms sqft price
## 1 1 3 2 1743 240000
## 2 2 4 3 1686 650000
## 3 3 6 4 4000 419900
## 4 4 3 3 1429 199900
## 5 5 5 5 3508 537000
## 6 6 3 2 NA 89000
library("scatterplot3d")
#colors <- colors[as.numeric(hw9_data$bathrooms)]
scatterplot3d(hw9_data$bedrooms, hw9_data$bathrooms, hw9_data$price, xlab="Bedrooms", ylab = "Bathrooms", zlab="Price")
#pairwise scatterplot
pairs(hw9_data[, 2:5])
Regression analysis
Run a simple OLS command, as follows (with your variable names, of course)
library(stargazer)
ols_reg <-lm(price~bedrooms+bathrooms+sqft, hw9_data)
ols_reg1 <-lm(log(price)~bedrooms+bathrooms+sqft, hw9_data)
summary(ols_reg)
##
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft, data = hw9_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -279666 -93275 -16856 83917 321457
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 112271.19 62060.15 1.809 0.0746 .
## bedrooms -29746.72 21211.01 -1.402 0.1651
## bathrooms 33689.79 19089.52 1.765 0.0818 .
## sqft 138.90 23.07 6.022 6.62e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 138400 on 72 degrees of freedom
## (4 observations deleted due to missingness)
## Multiple R-squared: 0.6334, Adjusted R-squared: 0.6182
## F-statistic: 41.47 on 3 and 72 DF, p-value: 1.12e-15
ols_reg$coefficients
## (Intercept) bedrooms bathrooms sqft
## 112271.1914 -29746.7230 33689.7884 138.9023
stargazer(ols_reg, ols_reg1,
title="Regression",
covariate.labels = c("Bedroom", "Bathroom", "Square Foot"),
type="text",
keep.stat=c("n", "rsq"),
column_labels=c("OLS", "Log Model"))
##
## Regression
## =========================================
## Dependent variable:
## ----------------------------
## price log(price)
## (1) (2)
## -----------------------------------------
## Bedroom -29,746.720 -0.062
## (21,211.010) (0.050)
##
## Bathroom 33,689.790* 0.082*
## (19,089.520) (0.045)
##
## Square Foot 138.902*** 0.0003***
## (23.066) (0.0001)
##
## Constant 112,271.200* 12.061***
## (62,060.150) (0.147)
##
## -----------------------------------------
## Observations 76 76
## R2 0.633 0.636
## =========================================
## Note: *p<0.1; **p<0.05; ***p<0.01
##
## Regression
## =============
## OLS Log Model
## -------------
On average, what is the value of having an additional bedroom in a home?
Please comment on those results, if you are so inclined.
Goodness of fit