3 Do some cleaning

Extract the numbers of bathrooms and bedrooms, drop the missing valus and take logs for price and sqft

#String the bathrooms, bedrooms and square footage
zillow_df<-myData %>%
  #The bracket cannot follow with d, instead needs to have a space!!![\\d] wrong; [\\d ]correct
  mutate(bedrooms = as.integer(str_trim(str_extract(Detail,"[\\d ]*(?=bds)")))) %>%
  mutate(bathrooms =as.integer(str_trim(str_extract(Detail,"[\\d ]*(?=ba)"))))  %>%
  mutate(sqft =str_trim(str_extract(Detail,"[\\d , ]*(?=sqft)"))) %>%
  mutate(sqft = as.numeric(str_replace(sqft,",",""))) %>%
  mutate(price = as.numeric(str_replace_all(Price,"[^0-9]*","")))

#Clean the Data,rule out NAs
zillow_clean<-zillow_df %>% 
  filter(!is.na(bedrooms)) %>%
  filter(!is.na(bathrooms)) %>%
  filter(!is.na(sqft)) %>%
  filter(!is.na(price))

#Take log for price and sqft
zillow_clean$price1<-log(zillow_clean$price)
zillow_clean$sqft1<-log(zillow_clean$sqft)

4 Visualization

#Take log for all valuables
zillow_clean$price1<-log(zillow_clean$price)
zillow_clean$sqft1<-log(zillow_clean$sqft)
zillow_clean$bed1<-log(zillow_clean$bedrooms)
zillow_clean$bath1<-log(zillow_clean$bathrooms)


#Plot graphics
##Price and bedrooms (boxplot)
ggplot(zillow_clean,aes(x=bedrooms,y=price,fill=price,group=bedrooms))+
     geom_boxplot(size=1)+
     scale_x_continuous(breaks=seq(min(zillow_clean$bedrooms),max(zillow_clean$bedrooms),1))+
     scale_y_continuous(labels= dollar,limits=c(0,2500000),expand=c(0,0))+
     labs(title= "Bedrooms V.S. Selling Price", y="Price",x="Bedroom")+
      theme_classic()+
  theme(plot.title = element_text( face="bold",size=12, hjust=0.5))+
  theme(axis.title.y= element_text(face="bold",size=10,vjust=0.5,color="black"))+
  theme(axis.title.x= element_text(face="bold",size=10,vjust=0.5,color="black"))

##Price and bathrooms 
ggplot(zillow_clean,aes(x=bathrooms,y=price,fill=price,group=bathrooms))+
  geom_boxplot(size=1)+
  scale_x_continuous(breaks=seq(min(zillow_clean$bathrooms),max(zillow_clean$bathrooms),1))+
  scale_y_continuous(labels= dollar,limits=c(0,2500000),expand=c(0,0))+
  labs(title= "Bathrooms V.S. Selling Price", y="Price",x="Bathroom")+
  theme_classic()+
  theme(plot.title = element_text( face="bold",size=12, hjust=0.5))+
  theme(axis.title.y= element_text(face="bold",size=10,vjust=0.5,color="black"))+
  theme(axis.title.x= element_text(face="bold",size=10,vjust=0.5,color="black"))+
  theme(legend.position = "none")

##Sqrt and prices
##Make a color gradient for price in group mean (cheapest=black, most expansive=red)
zillow_adjust<-zillow_clean %>% 
      select(bedrooms, bathrooms, sqft, price) %>%
      group_by(bedrooms,bathrooms) %>%
      mutate(price,A=mean(price))
zillow_adjust$col<-ifelse(zillow_adjust$A>100000, ifelse(zillow_adjust$A>250000, ifelse(zillow_adjust$A>450000, ifelse(zillow_adjust$A>650000, ifelse(zillow_adjust$A>850000, ifelse(zillow_adjust$A>1000000, "Above 1,000,000","850,000~1,000,000"), "650,000~850,000"),"450,000~650,000"), "250,000~ 450,000"),"100,000~250,000"), "Below 100,000")
ggplot(zillow_adjust,aes(y=bedrooms,x=bathrooms))+
      geom_point(aes(size= sqft/100,col=col))+
  scale_y_continuous(breaks=seq(min(zillow_adjust$bedrooms),max(zillow_adjust$bedrooms),1))+
  scale_x_continuous(breaks=seq(min(zillow_adjust$bathrooms),max(zillow_adjust$bathrooms),1))+
  scale_color_discrete(name="Price")+
  labs(title= "Mean selling prices in Groups", x="Bathroom",y="Bedroom")+
  theme_classic()+
  theme(plot.title = element_text( face="bold",size=12, hjust=0.5))+
  theme(axis.title.y= element_text(face="bold",size=10,vjust=0.5,color="black"))+
  theme(axis.title.x= element_text(face="bold",size=10,vjust=0.5,color="black"))

5 Regression Estimation

5.1 Run a simple OLS command and goodness of fit

I make a basic model with OLS, and also provide estimations for log-log, log-level and internations among three variables.

5.2 Basic Model

5.2.1 Coefficient

model<-lm(price~bedrooms+bathrooms+sqft,zillow_clean)
model$coefficient
##  (Intercept)     bedrooms    bathrooms         sqft 
## -11711.49949 -21513.91666  95511.87617     65.02217
  • On average, the value of having an additional bedroom in a house is $-17040.8.
  • On average, the value of having an additional bathroom in a house is $99438.5, the coefficient is statistically significant.
  • On average, the value of having an additional square foot is $59.5, the eoefficient is statistically significant.

5.2.2 Goodness of fit

summary(model)
## 
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft, data = zillow_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1410213   -80924   -24257    41231   962929 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11711.499  26780.024  -0.437   0.6620    
## bedrooms    -21513.917  10312.678  -2.086   0.0374 *  
## bathrooms    95511.876   8860.634  10.779   <2e-16 ***
## sqft            65.022      6.316  10.294   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 165800 on 555 degrees of freedom
## Multiple R-squared:  0.5532, Adjusted R-squared:  0.5508 
## F-statistic: 229.1 on 3 and 555 DF,  p-value: < 2.2e-16

5.3 log-log model

5.3.1 Coefficent

model1<-lm(price1 ~ bed1 + bath1 + sqft1, data = zillow_clean)
model1$coefficient
## (Intercept)        bed1       bath1       sqft1 
##   6.6321258  -0.2213583   0.4054621   0.7624110
summary(model1)
## 
## Call:
## lm(formula = price1 ~ bed1 + bath1 + sqft1, data = zillow_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0672 -0.2006 -0.0281  0.1552  1.3158 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.63213    0.34723  19.100  < 2e-16 ***
## bed1        -0.22136    0.08442  -2.622  0.00898 ** 
## bath1        0.40546    0.05961   6.802 2.67e-11 ***
## sqft1        0.76241    0.05432  14.034  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3479 on 555 degrees of freedom
## Multiple R-squared:  0.6426, Adjusted R-squared:  0.6407 
## F-statistic: 332.7 on 3 and 555 DF,  p-value: < 2.2e-16

5.3.2 Goodness of fit

summary(model1)
## 
## Call:
## lm(formula = price1 ~ bed1 + bath1 + sqft1, data = zillow_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0672 -0.2006 -0.0281  0.1552  1.3158 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.63213    0.34723  19.100  < 2e-16 ***
## bed1        -0.22136    0.08442  -2.622  0.00898 ** 
## bath1        0.40546    0.05961   6.802 2.67e-11 ***
## sqft1        0.76241    0.05432  14.034  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3479 on 555 degrees of freedom
## Multiple R-squared:  0.6426, Adjusted R-squared:  0.6407 
## F-statistic: 332.7 on 3 and 555 DF,  p-value: < 2.2e-16

5.4 log-level model1

5.4.1 Coefficients

model2<-lm(price1~bedrooms+bathrooms+sqft,zillow_clean)
model2$coefficient
##  (Intercept)     bedrooms    bathrooms         sqft 
## 1.156493e+01 2.419966e-02 2.406051e-01 9.744382e-05
summary(model2)    
## 
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft, data = zillow_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.21891 -0.22075 -0.02506  0.18709  1.42966 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.156e+01  6.274e-02 184.324  < 2e-16 ***
## bedrooms    2.420e-02  2.416e-02   1.002    0.317    
## bathrooms   2.406e-01  2.076e-02  11.590  < 2e-16 ***
## sqft        9.744e-05  1.480e-05   6.585 1.06e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3885 on 555 degrees of freedom
## Multiple R-squared:  0.5544, Adjusted R-squared:  0.552 
## F-statistic: 230.2 on 3 and 555 DF,  p-value: < 2.2e-16

5.4.2 Goodnes of fit

summary(model2)    
## 
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft, data = zillow_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.21891 -0.22075 -0.02506  0.18709  1.42966 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.156e+01  6.274e-02 184.324  < 2e-16 ***
## bedrooms    2.420e-02  2.416e-02   1.002    0.317    
## bathrooms   2.406e-01  2.076e-02  11.590  < 2e-16 ***
## sqft        9.744e-05  1.480e-05   6.585 1.06e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3885 on 555 degrees of freedom
## Multiple R-squared:  0.5544, Adjusted R-squared:  0.552 
## F-statistic: 230.2 on 3 and 555 DF,  p-value: < 2.2e-16

5.5 log-level model2: take log of sqft

5.5.1 Coefficients

model3<-lm(price1~bedrooms+bathrooms+sqft1,zillow_clean)
model3$coefficient
## (Intercept)    bedrooms   bathrooms       sqft1 
##  6.63193321 -0.07166311  0.14794649  0.75587700
summary(model3) 
## 
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft1, data = zillow_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.02022 -0.19717 -0.02696  0.16672  1.33950 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.63193    0.34531  19.206  < 2e-16 ***
## bedrooms    -0.07166    0.02286  -3.136  0.00181 ** 
## bathrooms    0.14795    0.01977   7.484 2.85e-13 ***
## sqft1        0.75588    0.05298  14.266  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.345 on 555 degrees of freedom
## Multiple R-squared:  0.6485, Adjusted R-squared:  0.6466 
## F-statistic: 341.3 on 3 and 555 DF,  p-value: < 2.2e-16

5.5.2 Goodness of fit

summary(model3) 
## 
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft1, data = zillow_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.02022 -0.19717 -0.02696  0.16672  1.33950 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.63193    0.34531  19.206  < 2e-16 ***
## bedrooms    -0.07166    0.02286  -3.136  0.00181 ** 
## bathrooms    0.14795    0.01977   7.484 2.85e-13 ***
## sqft1        0.75588    0.05298  14.266  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.345 on 555 degrees of freedom
## Multiple R-squared:  0.6485, Adjusted R-squared:  0.6466 
## F-statistic: 341.3 on 3 and 555 DF,  p-value: < 2.2e-16

5.6 Interactions among three variables

5.6.1 log-level model1

model4<-lm(price1~bedrooms+bathrooms+sqft+bedrooms:bathrooms+bathrooms:sqft+bedrooms:sqft,zillow_clean)
model4$coefficient
##        (Intercept)           bedrooms          bathrooms               sqft 
##       1.093131e+01       2.379824e-01       2.791030e-01       2.543691e-04 
## bedrooms:bathrooms     bathrooms:sqft      bedrooms:sqft 
##      -2.399444e-02       2.032450e-05      -5.456394e-05
summary(model4) 
## 
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft + bedrooms:bathrooms + 
##     bathrooms:sqft + bedrooms:sqft, data = zillow_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.26002 -0.23264 -0.04564  0.19361  1.40623 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.093e+01  1.399e-01  78.118  < 2e-16 ***
## bedrooms            2.380e-01  5.318e-02   4.475 9.29e-06 ***
## bathrooms           2.791e-01  7.318e-02   3.814 0.000152 ***
## sqft                2.544e-04  7.785e-05   3.267 0.001153 ** 
## bedrooms:bathrooms -2.399e-02  1.669e-02  -1.438 0.151008    
## bathrooms:sqft      2.032e-05  9.285e-06   2.189 0.029014 *  
## bedrooms:sqft      -5.456e-05  2.136e-05  -2.554 0.010902 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3802 on 552 degrees of freedom
## Multiple R-squared:  0.5754, Adjusted R-squared:  0.5707 
## F-statistic: 124.7 on 6 and 552 DF,  p-value: < 2.2e-16

5.6.2 log-level model2:take log of sqft

model5<-lm(price1~bedrooms+bathrooms+sqft1+bedrooms:bathrooms+bathrooms:sqft1+bedrooms:sqft1,zillow_clean)
model5$coefficient
##        (Intercept)           bedrooms          bathrooms              sqft1 
##        4.984699780        0.575853160       -0.046136387        0.972305926 
## bedrooms:bathrooms    bathrooms:sqft1     bedrooms:sqft1 
##        0.003270294        0.023902467       -0.084432509
summary(model5) 
## 
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft1 + bedrooms:bathrooms + 
##     bathrooms:sqft1 + bedrooms:sqft1, data = zillow_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.92352 -0.19399 -0.03004  0.16405  1.32979 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.98470    1.16636   4.274 2.26e-05 ***
## bedrooms            0.57585    0.36730   1.568    0.118    
## bathrooms          -0.04614    0.23561  -0.196    0.845    
## sqft1               0.97231    0.17371   5.597 3.44e-08 ***
## bedrooms:bathrooms  0.00327    0.01627   0.201    0.841    
## bathrooms:sqft1     0.02390    0.03161   0.756    0.450    
## bedrooms:sqft1     -0.08443    0.05033  -1.678    0.094 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3446 on 552 degrees of freedom
## Multiple R-squared:  0.6512, Adjusted R-squared:  0.6474 
## F-statistic: 171.8 on 6 and 552 DF,  p-value: < 2.2e-16

6 Discussion

The value of R-squared in each model varies around 60%, this indicates that these models can be fitted into linear regressions( but not well enough to be fit into linear regressions). From models with interactions, we can figure out the statistically insignificance of their corresponding coefficients, therefore we do not need to consider the effects of interactions. Interestingly, these models show a negative sign for the coefficients of bedrooms while a positive sign for bathrooms. This implies that housing prices will increases with more bathrooms. I think it might be caused by the demand for housing rent market at Athens. Athens is a university town, and plenty of students choose to a off-campus apartment and share it with other roommates. The house with more bathrooms are preferred by renters since they are more likely have their own bathrooms.

However, the goodness of fit only tells how well the data is “suitable” for a linear regression. We can do other tests, such as test for normality, linearity, no autocorrelation and so on, to check whether the assumptions for CLRM are satisisfied. In addition, we can pick up the outliers. Here I compare the basic model with the model with highest R-squared value. For example, we can

  • Test for CLRM assumptions
library(gvlma)
gvmodel<-gvlma(model)
summary(gvmodel)
## 
## Call:
## lm(formula = price ~ bedrooms + bathrooms + sqft, data = zillow_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1410213   -80924   -24257    41231   962929 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -11711.499  26780.024  -0.437   0.6620    
## bedrooms    -21513.917  10312.678  -2.086   0.0374 *  
## bathrooms    95511.876   8860.634  10.779   <2e-16 ***
## sqft            65.022      6.316  10.294   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 165800 on 555 degrees of freedom
## Multiple R-squared:  0.5532, Adjusted R-squared:  0.5508 
## F-statistic: 229.1 on 3 and 555 DF,  p-value: < 2.2e-16
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = model) 
## 
##                        Value p-value                   Decision
## Global Stat        6900.5862  0.0000 Assumptions NOT satisfied!
## Skewness             86.3204  0.0000 Assumptions NOT satisfied!
## Kurtosis           6813.3676  0.0000 Assumptions NOT satisfied!
## Link Function         0.2214  0.6380    Assumptions acceptable.
## Heteroscedasticity    0.6769  0.4107    Assumptions acceptable.
gvmodel3<-gvlma(model3)
summary(gvmodel3)
## 
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft1, data = zillow_clean)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.02022 -0.19717 -0.02696  0.16672  1.33950 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.63193    0.34531  19.206  < 2e-16 ***
## bedrooms    -0.07166    0.02286  -3.136  0.00181 ** 
## bathrooms    0.14795    0.01977   7.484 2.85e-13 ***
## sqft1        0.75588    0.05298  14.266  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.345 on 555 degrees of freedom
## Multiple R-squared:  0.6485, Adjusted R-squared:  0.6466 
## F-statistic: 341.3 on 3 and 555 DF,  p-value: < 2.2e-16
## 
## 
## ASSESSMENT OF THE LINEAR MODEL ASSUMPTIONS
## USING THE GLOBAL TEST ON 4 DEGREES-OF-FREEDOM:
## Level of Significance =  0.05 
## 
## Call:
##  gvlma(x = model3) 
## 
##                       Value p-value                   Decision
## Global Stat        287.8161 0.00000 Assumptions NOT satisfied!
## Skewness             0.4645 0.49551    Assumptions acceptable.
## Kurtosis           276.6103 0.00000 Assumptions NOT satisfied!
## Link Function        6.3729 0.01159 Assumptions NOT satisfied!
## Heteroscedasticity   4.3684 0.03661 Assumptions NOT satisfied!
  • Test for outliers
outlierTest(model)
##       rstudent unadjusted p-value Bonferroni p
## 366 -15.486560         3.2754e-45   1.8309e-42
## 41    6.229544         9.2608e-10   5.1768e-07
## 342   6.073004         2.3331e-09   1.3042e-06
## 159   5.918241         5.7059e-09   3.1896e-06
## 506   5.258434         2.0782e-07   1.1617e-04
## 117   5.003496         7.5720e-07   4.2327e-04
## 498   4.442651         1.0733e-05   5.9999e-03
outlierTest(model3)
##      rstudent unadjusted p-value Bonferroni p
## 366 -6.488703         1.9227e-10   1.0748e-07
## 139 -4.403827         1.2768e-05   7.1375e-03

Let’s get rid of the outliers and the goodness of fit for model 3 increases to 66.4%.

new_model3<-lm(price1~bedrooms+bathrooms+sqft1,zillow_clean[-c(304,86),])
new_model3$coefficient
## (Intercept)    bedrooms   bathrooms       sqft1 
##   6.6247418  -0.0712536   0.1472797   0.7570670
summary(new_model3)
## 
## Call:
## lm(formula = price1 ~ bedrooms + bathrooms + sqft1, data = zillow_clean[-c(304, 
##     86), ])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0246 -0.1964 -0.0280  0.1655  1.3383 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.62474    0.34623  19.134  < 2e-16 ***
## bedrooms    -0.07125    0.02288  -3.114  0.00194 ** 
## bathrooms    0.14728    0.01985   7.420 4.44e-13 ***
## sqft1        0.75707    0.05309  14.260  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3448 on 553 degrees of freedom
## Multiple R-squared:  0.6493, Adjusted R-squared:  0.6474 
## F-statistic: 341.3 on 3 and 553 DF,  p-value: < 2.2e-16

Hedonic pricing model is to identify price factors both by internal characteristics of the good being sold and external factors affecting it. The numbers of bedrooms, bathrooms and square feet can be regarded as internal factors for housing prices. Also, there are external factors to affect the value of houses: green coverage, education resources,* parking, doorways* and priviate open space.* The more data we can collect for the measurement of these external factors, the better we can improve our housing price model. For example, we can find the data for tree coverage at Athens see https://athensclarkecounty.com/DocumentCenter/View/377/Map45_treecover?bidId=, and we can use this tree coverage map and the address of houses to investigate whether the tree coverage will affect the value of a house.