1 Run a housing search
1.1 Necessary packages and functions
library(rvest)
library(dplyr)
library(tidyverse)
library(stringr)
library(ggplot2)
library(scales)
library(xml2)
library(car)
get_price<-function(html){
html %>%
html_nodes('.list-card-price') %>%
html_text() %>%
str_trim()
}
get_detail<-function(html){
html %>%
html_nodes('.list-card-details') %>%
html_text() %>%
str_trim()
}
get_address<-function(html){
html %>%
html_nodes('.list-card-addr') %>%
html_text() %>%
str_trim()
}2 Scrape the results of your housing search
2.1 Scape results from website
I used Jonathan Katz’s codes but with an improvement to avoid the effects of NA
url<-"https://www.zillow.com/homes/for_sale/house_type/1-_beds/"
website<- read_html(url)
more.urls <- website %>%
html_nodes(".zsg-pagination a") %>%
html_attr("href")
more.urls <- na.omit(unique(more.urls))
more.urls <- strsplit(more.urls, '/')
more.urls <- t(lapply(more.urls, function(x) x[length(x)]))
more.urls[1,1]<-"1_p"
more.urls <- as.numeric(gsub('[_p]','',more.urls))
more.urls <- max(more.urls)
#Make a list for the url address
url_page<-list()
for (i in 1:more.urls){
if (i==1){
url_page[i]<-"https://www.zillow.com/homes/for_sale/house_type/1-_beds/?searchQueryState={%22pagination%22:{},%22mapBounds%22:{%22west%22:-83.55099531901479,%22east%22:-83.25917097819447,%22south%22:33.83215645713838,%22north%22:34.01789704192027},%22mapZoom%22:12,%22isMapVisible%22:true,%22filterState%22:{%22price%22:{%22min%22:0},%22monthlyPayment%22:{%22min%22:0},%22beds%22:{%22min%22:1},%22sortSelection%22:{%22value%22:%22globalrelevanceex%22},%22isManufactured%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}"
}
else{
url_page[i]<-paste0("https://www.zillow.com/homes/for_sale/house_type/1-_beds/",i, "_p/?searchQueryState={%22pagination%22:{%22currentPage%22:",i,"},%22mapBounds%22:{%22west%22:-83.55099531901479,%22east%22:-83.25917097819447,%22south%22:33.83215645713838,%22north%22:34.01789704192027},%22mapZoom%22:12,%22isMapVisible%22:true,%22filterState%22:{%22price%22:{%22min%22:0},%22monthlyPayment%22:{%22min%22:0},%22beds%22:{%22min%22:1},%22sortSelection%22:{%22value%22:%22globalrelevanceex%22},%22isManufactured%22:{%22value%22:false},%22isLotLand%22:{%22value%22:false},%22isTownhouse%22:{%22value%22:false}},%22isListVisible%22:true}")
}
}
#Convert list to vector
scraped_data1<-as.vector(unlist(url_page))2.2 Extract prices and details of the house
p<-c()
d<-c()
a<-c()
pV<-c()
dV<-c()
aV<-c()
#Need to keep enough rows (pv<-c()), otherwise the new results from loop will cover the previous one
for (i in seq_along(scraped_data1)){
htmlpage<-read_html(scraped_data1[i])
price<-get_price(htmlpage)
p<-price
pV<-c(pV,p)
detail<-get_detail(htmlpage)
d<-detail
dV<-c(dV,d)
address<-get_address(htmlpage)
a<-address
aV<-c(aV,a)
}## Error in open.connection(x, "rb"): Maximum (10) redirects followed
myData<-cbind(Price=pV,Detail=dV,Address=aV)
myData<-as.data.frame(myData)3 Do some cleaning
Extract the numbers of bathrooms and bedrooms, drop the missing valus and take logs for price and sqft
#String the bathrooms, bedrooms and square footage
zillow_df<-myData %>%
#The bracket cannot follow with d, instead needs to have a space!!![\\d] wrong; [\\d ]correct
mutate(bedrooms = as.integer(str_trim(str_extract(Detail,"[\\d ]*(?=bds)")))) %>%
mutate(bathrooms =as.integer(str_trim(str_extract(Detail,"[\\d ]*(?=ba)")))) %>%
mutate(sqft =str_trim(str_extract(Detail,"[\\d , ]*(?=sqft)"))) %>%
mutate(sqft = as.numeric(str_replace(sqft,",",""))) %>%
mutate(price = as.numeric(str_replace_all(Price,"[^0-9]*","")))
#Clean the Data,rule out NAs
zillow_clean<-zillow_df %>%
filter(!is.na(bedrooms)) %>%
filter(!is.na(bathrooms)) %>%
filter(!is.na(sqft)) %>%
filter(!is.na(price))
#Take log for price and sqft
zillow_clean$price1<-log(zillow_clean$price)
zillow_clean$sqft1<-log(zillow_clean$sqft)4 Visualization
#Take log for all valuables
zillow_clean$price1<-log(zillow_clean$price)
zillow_clean$sqft1<-log(zillow_clean$sqft)
zillow_clean$bed1<-log(zillow_clean$bedrooms)
zillow_clean$bath1<-log(zillow_clean$bathrooms)
#Plot graphics
##Price and bedrooms (boxplot)
ggplot(zillow_clean,aes(x=bedrooms,y=price,fill=price,group=bedrooms))+
geom_boxplot(size=1)+
scale_x_continuous(breaks=seq(min(zillow_clean$bedrooms),max(zillow_clean$bedrooms),1))+
scale_y_continuous(labels= dollar,limits=c(0,2500000),expand=c(0,0))+
labs(title= "Bedrooms V.S. Selling Price", y="Price",x="Bedroom")+
theme_classic()+
theme(plot.title = element_text( face="bold",size=12, hjust=0.5))+
theme(axis.title.y= element_text(face="bold",size=10,vjust=0.5,color="black"))+
theme(axis.title.x= element_text(face="bold",size=10,vjust=0.5,color="black"))##Price and bathrooms
ggplot(zillow_clean,aes(x=bathrooms,y=price,fill=price,group=bathrooms))+
geom_boxplot(size=1)+
scale_x_continuous(breaks=seq(min(zillow_clean$bathrooms),max(zillow_clean$bathrooms),1))+
scale_y_continuous(labels= dollar,limits=c(0,2500000),expand=c(0,0))+
labs(title= "Bathrooms V.S. Selling Price", y="Price",x="Bathroom")+
theme_classic()+
theme(plot.title = element_text( face="bold",size=12, hjust=0.5))+
theme(axis.title.y= element_text(face="bold",size=10,vjust=0.5,color="black"))+
theme(axis.title.x= element_text(face="bold",size=10,vjust=0.5,color="black"))+
theme(legend.position = "none")##Sqrt and prices
##Make a color gradient for price in group mean (cheapest=black, most expansive=red)
zillow_adjust<-zillow_clean %>%
select(bedrooms, bathrooms, sqft, price) %>%
group_by(bedrooms,bathrooms) %>%
mutate(price,A=mean(price))
zillow_adjust$col<-ifelse(zillow_adjust$A>100000, ifelse(zillow_adjust$A>250000, ifelse(zillow_adjust$A>450000, ifelse(zillow_adjust$A>650000, ifelse(zillow_adjust$A>850000, ifelse(zillow_adjust$A>1000000, "Above 1,000,000","850,000~1,000,000"), "650,000~850,000"),"450,000~650,000"), "250,000~ 450,000"),"100,000~250,000"), "Below 100,000")
ggplot(zillow_adjust,aes(y=bedrooms,x=bathrooms))+
geom_point(aes(size= sqft/100,col=col))+
scale_y_continuous(breaks=seq(min(zillow_adjust$bedrooms),max(zillow_adjust$bedrooms),1))+
scale_x_continuous(breaks=seq(min(zillow_adjust$bathrooms),max(zillow_adjust$bathrooms),1))+
scale_color_discrete(name="Price")+
labs(title= "Mean selling prices in Groups", x="Bathroom",y="Bedroom")+
theme_classic()+
theme(plot.title = element_text( face="bold",size=12, hjust=0.5))+
theme(axis.title.y= element_text(face="bold",size=10,vjust=0.5,color="black"))+
theme(axis.title.x= element_text(face="bold",size=10,vjust=0.5,color="black"))5 Regression Estimation
5.1 Run a simple OLS command and goodness of fit
I make a basic model with OLS, and also provide estimations for log-log, log-level and internations among three variables.
5.2 Basic Model
5.2.1 Coefficient
model<-lm(price~bedrooms+bathrooms+sqft,zillow_clean)
model$coefficient## (Intercept) bedrooms bathrooms sqft
## -11711.49949 -21513.91666 95511.87617 65.02217
- On average, the value of having an additional bedroom in a house is $-17040.8.
- On average, the value of having an additional bathroom in a house is $99438.5, the coefficient is statistically significant.
- On average, the value of having an additional square foot is $59.5, the eoefficient is statistically significant.
5.2.2 Goodness of fit
summary(model)##
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft, data = zillow_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1410213 -80924 -24257 41231 962929
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11711.499 26780.024 -0.437 0.6620
## bedrooms -21513.917 10312.678 -2.086 0.0374 *
## bathrooms 95511.876 8860.634 10.779 <2e-16 ***
## sqft 65.022 6.316 10.294 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 165800 on 555 degrees of freedom
## Multiple R-squared: 0.5532, Adjusted R-squared: 0.5508
## F-statistic: 229.1 on 3 and 555 DF, p-value: < 2.2e-16
5.3 log-log model
5.3.1 Coefficent
model1<-lm(price1 ~ bed1 + bath1 + sqft1, data = zillow_clean)
model1$coefficient## (Intercept) bed1 bath1 sqft1
## 6.6321258 -0.2213583 0.4054621 0.7624110
summary(model1)##
## Call:
## lm(formula = price1 ~ bed1 + bath1 + sqft1, data = zillow_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0672 -0.2006 -0.0281 0.1552 1.3158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.63213 0.34723 19.100 < 2e-16 ***
## bed1 -0.22136 0.08442 -2.622 0.00898 **
## bath1 0.40546 0.05961 6.802 2.67e-11 ***
## sqft1 0.76241 0.05432 14.034 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3479 on 555 degrees of freedom
## Multiple R-squared: 0.6426, Adjusted R-squared: 0.6407
## F-statistic: 332.7 on 3 and 555 DF, p-value: < 2.2e-16
5.3.2 Goodness of fit
summary(model1)##
## Call:
## lm(formula = price1 ~ bed1 + bath1 + sqft1, data = zillow_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0672 -0.2006 -0.0281 0.1552 1.3158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.63213 0.34723 19.100 < 2e-16 ***
## bed1 -0.22136 0.08442 -2.622 0.00898 **
## bath1 0.40546 0.05961 6.802 2.67e-11 ***
## sqft1 0.76241 0.05432 14.034 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3479 on 555 degrees of freedom
## Multiple R-squared: 0.6426, Adjusted R-squared: 0.6407
## F-statistic: 332.7 on 3 and 555 DF, p-value: < 2.2e-16
5.4 log-level model1
5.4.1 Coefficients
model2<-lm(price1~bedrooms+bathrooms+sqft,zillow_clean)
model2$coefficient## (Intercept) bedrooms bathrooms sqft
## 1.156493e+01 2.419966e-02 2.406051e-01 9.744382e-05
summary(model2) ##
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft, data = zillow_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.21891 -0.22075 -0.02506 0.18709 1.42966
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.156e+01 6.274e-02 184.324 < 2e-16 ***
## bedrooms 2.420e-02 2.416e-02 1.002 0.317
## bathrooms 2.406e-01 2.076e-02 11.590 < 2e-16 ***
## sqft 9.744e-05 1.480e-05 6.585 1.06e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3885 on 555 degrees of freedom
## Multiple R-squared: 0.5544, Adjusted R-squared: 0.552
## F-statistic: 230.2 on 3 and 555 DF, p-value: < 2.2e-16
5.4.2 Goodnes of fit
summary(model2) ##
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft, data = zillow_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.21891 -0.22075 -0.02506 0.18709 1.42966
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.156e+01 6.274e-02 184.324 < 2e-16 ***
## bedrooms 2.420e-02 2.416e-02 1.002 0.317
## bathrooms 2.406e-01 2.076e-02 11.590 < 2e-16 ***
## sqft 9.744e-05 1.480e-05 6.585 1.06e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3885 on 555 degrees of freedom
## Multiple R-squared: 0.5544, Adjusted R-squared: 0.552
## F-statistic: 230.2 on 3 and 555 DF, p-value: < 2.2e-16
5.5 log-level model2: take log of sqft
5.5.1 Coefficients
model3<-lm(price1~bedrooms+bathrooms+sqft1,zillow_clean)
model3$coefficient## (Intercept) bedrooms bathrooms sqft1
## 6.63193321 -0.07166311 0.14794649 0.75587700
summary(model3) ##
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft1, data = zillow_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.02022 -0.19717 -0.02696 0.16672 1.33950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.63193 0.34531 19.206 < 2e-16 ***
## bedrooms -0.07166 0.02286 -3.136 0.00181 **
## bathrooms 0.14795 0.01977 7.484 2.85e-13 ***
## sqft1 0.75588 0.05298 14.266 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.345 on 555 degrees of freedom
## Multiple R-squared: 0.6485, Adjusted R-squared: 0.6466
## F-statistic: 341.3 on 3 and 555 DF, p-value: < 2.2e-16
5.5.2 Goodness of fit
summary(model3) ##
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft1, data = zillow_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.02022 -0.19717 -0.02696 0.16672 1.33950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.63193 0.34531 19.206 < 2e-16 ***
## bedrooms -0.07166 0.02286 -3.136 0.00181 **
## bathrooms 0.14795 0.01977 7.484 2.85e-13 ***
## sqft1 0.75588 0.05298 14.266 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.345 on 555 degrees of freedom
## Multiple R-squared: 0.6485, Adjusted R-squared: 0.6466
## F-statistic: 341.3 on 3 and 555 DF, p-value: < 2.2e-16
5.6 Interactions among three variables
5.6.1 log-level model1
model4<-lm(price1~bedrooms+bathrooms+sqft+bedrooms:bathrooms+bathrooms:sqft+bedrooms:sqft,zillow_clean)
model4$coefficient## (Intercept) bedrooms bathrooms sqft
## 1.093131e+01 2.379824e-01 2.791030e-01 2.543691e-04
## bedrooms:bathrooms bathrooms:sqft bedrooms:sqft
## -2.399444e-02 2.032450e-05 -5.456394e-05
summary(model4) ##
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft + bedrooms:bathrooms +
## bathrooms:sqft + bedrooms:sqft, data = zillow_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.26002 -0.23264 -0.04564 0.19361 1.40623
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.093e+01 1.399e-01 78.118 < 2e-16 ***
## bedrooms 2.380e-01 5.318e-02 4.475 9.29e-06 ***
## bathrooms 2.791e-01 7.318e-02 3.814 0.000152 ***
## sqft 2.544e-04 7.785e-05 3.267 0.001153 **
## bedrooms:bathrooms -2.399e-02 1.669e-02 -1.438 0.151008
## bathrooms:sqft 2.032e-05 9.285e-06 2.189 0.029014 *
## bedrooms:sqft -5.456e-05 2.136e-05 -2.554 0.010902 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3802 on 552 degrees of freedom
## Multiple R-squared: 0.5754, Adjusted R-squared: 0.5707
## F-statistic: 124.7 on 6 and 552 DF, p-value: < 2.2e-16
5.6.2 log-level model2:take log of sqft
model5<-lm(price1~bedrooms+bathrooms+sqft1+bedrooms:bathrooms+bathrooms:sqft1+bedrooms:sqft1,zillow_clean)
model5$coefficient## (Intercept) bedrooms bathrooms sqft1
## 4.984699780 0.575853160 -0.046136387 0.972305926
## bedrooms:bathrooms bathrooms:sqft1 bedrooms:sqft1
## 0.003270294 0.023902467 -0.084432509
summary(model5) ##
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft1 + bedrooms:bathrooms +
## bathrooms:sqft1 + bedrooms:sqft1, data = zillow_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.92352 -0.19399 -0.03004 0.16405 1.32979
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.98470 1.16636 4.274 2.26e-05 ***
## bedrooms 0.57585 0.36730 1.568 0.118
## bathrooms -0.04614 0.23561 -0.196 0.845
## sqft1 0.97231 0.17371 5.597 3.44e-08 ***
## bedrooms:bathrooms 0.00327 0.01627 0.201 0.841
## bathrooms:sqft1 0.02390 0.03161 0.756 0.450
## bedrooms:sqft1 -0.08443 0.05033 -1.678 0.094 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3446 on 552 degrees of freedom
## Multiple R-squared: 0.6512, Adjusted R-squared: 0.6474
## F-statistic: 171.8 on 6 and 552 DF, p-value: < 2.2e-16
6 Discussion
The value of R-squared in each model varies around 60%, this indicates that these models can be fitted into linear regressions( but not well enough to be fit into linear regressions). From models with interactions, we can figure out the statistically insignificance of their corresponding coefficients, therefore we do not need to consider the effects of interactions. Interestingly, these models show a negative sign for the coefficients of bedrooms while a positive sign for bathrooms. This implies that housing prices will increases with more bathrooms. I think it might be caused by the demand for housing rent market at Athens. Athens is a university town, and plenty of students choose to a off-campus apartment and share it with other roommates. The house with more bathrooms are preferred by renters since they are more likely have their own bathrooms.
However, the goodness of fit only tells how well the data is “suitable” for a linear regression. We can do other tests, such as test for normality, linearity, no autocorrelation and so on, to check whether the assumptions for CLRM are satisisfied. In addition, we can pick up the outliers. Here I compare the basic model with the model with highest R-squared value. For example, we can
- Test for CLRM assumptions
library(gvlma)
gvmodel<-gvlma(model)
summary(gvmodel)##
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft, data = zillow_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1410213 -80924 -24257 41231 962929
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11711.499 26780.024 -0.437 0.6620
## bedrooms -21513.917 10312.678 -2.086 0.0374 *
## bathrooms 95511.876 8860.634 10.779 <2e-16 ***
## sqft 65.022 6.316 10.294 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 165800 on 555 degrees of freedom
## Multiple R-squared: 0.5532, Adjusted R-squared: 0.5508
## F-statistic: 229.1 on 3 and 555 DF, p-value: < 2.2e-16
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = model)
##
## Value p-value Decision
## Global Stat 6900.5862 0.0000 Assumptions NOT satisfied!
## Skewness 86.3204 0.0000 Assumptions NOT satisfied!
## Kurtosis 6813.3676 0.0000 Assumptions NOT satisfied!
## Link Function 0.2214 0.6380 Assumptions acceptable.
## Heteroscedasticity 0.6769 0.4107 Assumptions acceptable.
gvmodel3<-gvlma(model3)
summary(gvmodel3)##
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft1, data = zillow_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.02022 -0.19717 -0.02696 0.16672 1.33950
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.63193 0.34531 19.206 < 2e-16 ***
## bedrooms -0.07166 0.02286 -3.136 0.00181 **
## bathrooms 0.14795 0.01977 7.484 2.85e-13 ***
## sqft1 0.75588 0.05298 14.266 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.345 on 555 degrees of freedom
## Multiple R-squared: 0.6485, Adjusted R-squared: 0.6466
## F-statistic: 341.3 on 3 and 555 DF, p-value: < 2.2e-16
##
##
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance = 0.05
##
## Call:
## gvlma(x = model3)
##
## Value p-value Decision
## Global Stat 287.8161 0.00000 Assumptions NOT satisfied!
## Skewness 0.4645 0.49551 Assumptions acceptable.
## Kurtosis 276.6103 0.00000 Assumptions NOT satisfied!
## Link Function 6.3729 0.01159 Assumptions NOT satisfied!
## Heteroscedasticity 4.3684 0.03661 Assumptions NOT satisfied!
- Test for outliers
outlierTest(model)## rstudent unadjusted p-value Bonferroni p
## 366 -15.486560 3.2754e-45 1.8309e-42
## 41 6.229544 9.2608e-10 5.1768e-07
## 342 6.073004 2.3331e-09 1.3042e-06
## 159 5.918241 5.7059e-09 3.1896e-06
## 506 5.258434 2.0782e-07 1.1617e-04
## 117 5.003496 7.5720e-07 4.2327e-04
## 498 4.442651 1.0733e-05 5.9999e-03
outlierTest(model3)## rstudent unadjusted p-value Bonferroni p
## 366 -6.488703 1.9227e-10 1.0748e-07
## 139 -4.403827 1.2768e-05 7.1375e-03
Let’s get rid of the outliers and the goodness of fit for model 3 increases to 66.4%.
new_model3<-lm(price1~bedrooms+bathrooms+sqft1,zillow_clean[-c(304,86),])
new_model3$coefficient## (Intercept) bedrooms bathrooms sqft1
## 6.6247418 -0.0712536 0.1472797 0.7570670
summary(new_model3)##
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft1, data = zillow_clean[-c(304,
## 86), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.0246 -0.1964 -0.0280 0.1655 1.3383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.62474 0.34623 19.134 < 2e-16 ***
## bedrooms -0.07125 0.02288 -3.114 0.00194 **
## bathrooms 0.14728 0.01985 7.420 4.44e-13 ***
## sqft1 0.75707 0.05309 14.260 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3448 on 553 degrees of freedom
## Multiple R-squared: 0.6493, Adjusted R-squared: 0.6474
## F-statistic: 341.3 on 3 and 553 DF, p-value: < 2.2e-16
Hedonic pricing model is to identify price factors both by internal characteristics of the good being sold and external factors affecting it. The numbers of bedrooms, bathrooms and square feet can be regarded as internal factors for housing prices. Also, there are external factors to affect the value of houses: green coverage, education resources,* parking, doorways* and priviate open space.* The more data we can collect for the measurement of these external factors, the better we can improve our housing price model. For example, we can find the data for tree coverage at Athens see https://athensclarkecounty.com/DocumentCenter/View/377/Map45_treecover?bidId=, and we can use this tree coverage map and the address of houses to investigate whether the tree coverage will affect the value of a house.