Load the necessary packages:

knitr::opts_chunk$set(echo = TRUE)
library(haven)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(sandwich)
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Read the Davis2018 Stata data set.
Davis2018 <- read_dta("~/Downloads/Davis2018.dta")

# Derive numeric closing year and month from characters 1-4 and 6-7 of
# ClosingDate.
# NOTE(review): assumes ClosingDate renders as "YYYY-MM-DD" -- confirm.
Davis2018$ClosingYear <- as.numeric(substr(Davis2018$ClosingDate, 1, 4))
# Convert the extracted month substring. Converting ClosingDate itself (as the
# earlier version did) stores the whole date as a number, not the month.
Davis2018$ClosingMonth <- as.numeric(substr(Davis2018$ClosingDate, 6, 7))

# Keep only the rows whose closing year is 2018.
sample2018 <- Davis2018 %>%
  filter(ClosingYear == 2018)

# Bar plot of the mean sale price for each bedroom count.
fig.dist <- split(sample2018$SalePrice, sample2018$Bedroom)
# vapply() guarantees one numeric mean per group (sapply() is not type-stable).
fig.mean <- vapply(fig.dist, mean, numeric(1), na.rm = TRUE)
houseplot <- barplot(fig.mean, xlab = "Bedrooms", ylab = "Average Sale Price")

# Simple regression of sale price on closing month. Columns are resolved
# through `data =` instead of `Davis2018$...`, so coefficients are labelled by
# column name and the fit can be used with predict(newdata = ...).
reg <- lm(SalePrice ~ ClosingMonth, data = Davis2018)
summary(reg)
## 
## Call:
## lm(formula = Davis2018$SalePrice ~ Davis2018$ClosingMonth)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -394875 -183880  -14545  117956 1021005 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)
## (Intercept)            1673907.77 2077479.46   0.806    0.421
## Davis2018$ClosingMonth     -55.53     117.07  -0.474    0.636
## 
## Residual standard error: 227500 on 297 degrees of freedom
## Multiple R-squared:  0.0007569,  Adjusted R-squared:  -0.002608 
## F-statistic: 0.225 on 1 and 297 DF,  p-value: 0.6356

With an F statistic of .225 leading to a p-value of .6356, we find the overall regression is not significant at the .05 level.

# Re-test the coefficients of `reg` using the sandwich covariance estimator
# for the standard errors. Reuses the fitted model instead of refitting it.
coeftest(reg, vcov = sandwich)
## 
## t test of coefficients:
## 
##                           Estimate  Std. Error t value Pr(>|t|)
## (Intercept)            1673907.766 2167519.346  0.7723   0.4406
## Davis2018$ClosingMonth     -55.526     122.224 -0.4543   0.6499

By using the lmtest and sandwich packages, I obtained slightly larger, but robust, standard errors.

# Regress sale price on list price and days on market. Columns are resolved
# through `data =` rather than `Davis2018$...` inside the formula.
reg2 <- lm(SalePrice ~ ListPrice + DaysOnMarket, data = Davis2018)
summary(reg2)
## 
## Call:
## lm(formula = Davis2018$SalePrice ~ Davis2018$ListPrice + Davis2018$DaysOnMarket)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -233976  -10198   -2858    9512   86338 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             2.240e+04  5.084e+03   4.405 1.48e-05 ***
## Davis2018$ListPrice     9.772e-01  7.297e-03 133.924  < 2e-16 ***
## Davis2018$DaysOnMarket -2.603e+02  4.371e+01  -5.957 7.30e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28130 on 296 degrees of freedom
## Multiple R-squared:  0.9848, Adjusted R-squared:  0.9847 
## F-statistic:  9569 on 2 and 296 DF,  p-value: < 2.2e-16

In a multiple regression like this, each slope coefficient is the change in sale price associated with a one-unit increase in list price or in days on market, holding the other regressor constant. However, I don’t believe the zero conditional mean assumption is satisfied. Neither of the two regressors is a house characteristic, so characteristics such as size or number of bedrooms are left in the error term; since they almost certainly affect both list price and days on market, they bias the OLS estimates on the included regressors.

# Extend the previous specification with the HasPool and Size columns.
# Columns are resolved through `data =` rather than `Davis2018$...`.
reg3 <- lm(
  SalePrice ~ ListPrice + DaysOnMarket + HasPool + Size,
  data = Davis2018
)
summary(reg3)
## 
## Call:
## lm(formula = Davis2018$SalePrice ~ Davis2018$ListPrice + Davis2018$DaysOnMarket + 
##     Davis2018$HasPool + Davis2018$Size)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -233237   -9864   -3289   10110   88989 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             2.222e+04  5.092e+03   4.362 1.78e-05 ***
## Davis2018$ListPrice     9.939e-01  1.627e-02  61.093  < 2e-16 ***
## Davis2018$DaysOnMarket -2.580e+02  4.386e+01  -5.883 1.10e-08 ***
## Davis2018$HasPool       4.184e+01  3.921e+03   0.011    0.991    
## Davis2018$Size         -6.464e+00  5.722e+00  -1.130    0.260    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28160 on 294 degrees of freedom
## Multiple R-squared:  0.9848, Adjusted R-squared:  0.9846 
## F-statistic:  4775 on 4 and 294 DF,  p-value: < 2.2e-16

This regression, which adds the pool dummy variable and a regressor for size of the house, is highly significant at any threshold with an F-statistic of 4775.

# Sale price on list price plus a quadratic in days on market. I() protects
# the square so `^` is arithmetic rather than formula syntax. Columns are
# resolved through `data =` rather than `Davis2018$...`.
reg4 <- lm(
  SalePrice ~ ListPrice + DaysOnMarket + I(DaysOnMarket^2),
  data = Davis2018
)
summary(reg4)
## 
## Call:
## lm(formula = Davis2018$SalePrice ~ Davis2018$ListPrice + Davis2018$DaysOnMarket + 
##     I(Davis2018$DaysOnMarket^2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -232082  -10160   -3159   10242   85451 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  2.453e+04  5.283e+03   4.643 5.17e-06 ***
## Davis2018$ListPrice          9.767e-01  7.292e-03 133.942  < 2e-16 ***
## Davis2018$DaysOnMarket      -3.964e+02  1.034e+02  -3.833 0.000155 ***
## I(Davis2018$DaysOnMarket^2)  8.068e-01  5.560e-01   1.451 0.147821    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 28080 on 295 degrees of freedom
## Multiple R-squared:  0.9849, Adjusted R-squared:  0.9847 
## F-statistic:  6404 on 3 and 295 DF,  p-value: < 2.2e-16
# Coefficient order: intercept, list price, days on market, days on market^2.
coef(reg4)
##                 (Intercept)         Davis2018$ListPrice 
##               24530.0085316                   0.9766877 
##      Davis2018$DaysOnMarket I(Davis2018$DaysOnMarket^2) 
##                -396.3832153                   0.8068395
# Predicted change in sale price between 0 and 7 days on market. The quadratic
# coefficient multiplies days^2 (49), not days (7). Coefficients are taken
# from the fit by position instead of being retyped.
coef(reg4)[[3]] * 7 + coef(reg4)[[4]] * 7^2
## [1] -2735.147

Because the model is quadratic in days on market, the effect of an extra week depends on the starting point. Comparing a house that sells after 7 days with an otherwise identical listing that sells after 0 days, the predicted sale price is about 2,735 dollars lower (-396.38 × 7 + 0.8068 × 7² ≈ -2,735), holding list price constant.

Question 2:

# Read the rental data set. Assign it to RENTAL only: the earlier chained
# assignment (`RENTAL <- Davis2018 <- ...`) also overwrote the housing data
# held in Davis2018.
RENTAL <- read_dta("~/Downloads/RENTAL.dta")

# Keep the rows flagged y90 == 1.
rental90 <- RENTAL %>%
  filter(y90 == 1)

# Regress lrent on lpop, lavginc and pctstu for those rows.
reg5 <- lm(lrent ~ lpop + lavginc + pctstu, data = rental90)
summary(reg5)
## 
## Call:
## lm(formula = lrent ~ lpop + lavginc + pctstu, data = rental90)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.22706 -0.09469 -0.02827  0.03806  0.48271 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.042780   0.843875   0.051    0.960    
## lpop        0.065868   0.038826   1.696    0.095 .  
## lavginc     0.507015   0.080836   6.272 4.29e-08 ***
## pctstu      0.005630   0.001742   3.232    0.002 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1512 on 60 degrees of freedom
## Multiple R-squared:  0.4579, Adjusted R-squared:  0.4308 
## F-statistic: 16.89 on 3 and 60 DF,  p-value: 4.541e-08

If both the independent and the dependent variable are in logs, the coefficient can be interpreted as an elasticity. So a one percent increase in average income is associated with .507 percent higher rents, holding the other regressors constant. Since pctstu is not in logs, its coefficient is a semi-elasticity: a one percentage point increase in the student share of the population is associated with about .563 percent (100 × .00563) higher rents in the area.

##        lrent
## 1  0.5516071
## 2  0.4289236
## 3  0.4855080
## 4  0.7894783
## 5  0.6664791
## 6  0.8253188
## 7  0.5453234
## 8  0.4959292
## 9  0.8087320
## 10 0.4590969
## 11 0.6664791
## 12 0.6182160
## 13 0.5519438
## 14 0.6380877
## 15 0.6518292
## 16 0.5108256
## 17 0.4759617
## 18 0.5705447
## 19 0.5148497
## 20 0.5697684
## 21 0.5372767
## 22 0.3928337
## 23 0.7387824
## 24 0.5980387
## 25 0.5521994
## 26 0.6087804
## 27 0.5166907
## 28 0.4338241
## 29 0.4815888
## 30 0.4951186
## 31 0.5319953
## 32 0.7624297
## 33 0.7659063
## 34 0.6666079
## 35 0.6471848
## 36 0.6706743
## 37 0.6304369
## 38 0.4818382
## 39 0.4427724
## 40 0.5300832
## 41 0.4443808
## 42 0.4805326
## 43 0.3944759
## 44 0.4802251
## 45 0.4519849
## 46 0.5346365
## 47 0.6289048
## 48 0.6109095
## 49 0.6580558
## 50 0.4919286
## 51 0.5404701
## 52 0.4172158
## 53 0.4415412
## 54 0.5785809
## 55 0.5337491
## 56 0.5047464
## 57 0.5913644
## 58 0.7074475
## 59 0.5596156
## 60 0.4216728
## 61 0.4784904
## 62 0.4475307
## 63 0.4470139
## 64 0.6639175
##       lrent
##  [1,]  TRUE
##  [2,]  TRUE
##  [3,]  TRUE
##  [4,]  TRUE
##  [5,]  TRUE
##  [6,]  TRUE
##  [7,]  TRUE
##  [8,]  TRUE
##  [9,]  TRUE
## [10,]  TRUE
## [11,]  TRUE
## [12,]  TRUE
## [13,]  TRUE
## [14,]  TRUE
## [15,]  TRUE
## [16,]  TRUE
## [17,]  TRUE
## [18,]  TRUE
## [19,]  TRUE
## [20,]  TRUE
## [21,]  TRUE
## [22,]  TRUE
## [23,]  TRUE
## [24,]  TRUE
## [25,]  TRUE
## [26,]  TRUE
## [27,]  TRUE
## [28,]  TRUE
## [29,]  TRUE
## [30,]  TRUE
## [31,]  TRUE
## [32,]  TRUE
## [33,]  TRUE
## [34,]  TRUE
## [35,]  TRUE
## [36,]  TRUE
## [37,]  TRUE
## [38,]  TRUE
## [39,]  TRUE
## [40,]  TRUE
## [41,]  TRUE
## [42,]  TRUE
## [43,]  TRUE
## [44,]  TRUE
## [45,]  TRUE
## [46,]  TRUE
## [47,]  TRUE
## [48,]  TRUE
## [49,]  TRUE
## [50,]  TRUE
## [51,]  TRUE
## [52,]  TRUE
## [53,]  TRUE
## [54,]  TRUE
## [55,]  TRUE
## [56,]  TRUE
## [57,]  TRUE
## [58,]  TRUE
## [59,]  TRUE
## [60,]  TRUE
## [61,]  TRUE
## [62,]  TRUE
## [63,]  TRUE
## [64,]  TRUE

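We can interpret clrent = .5516 as a change in log rent of .5516 between 1980 and 1990, i.e. roughly a 55 percent increase in rents by the log approximation (exactly, exp(.5516) − 1 ≈ 74 percent).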

# Regress clrent on clpop, clavginc and cpctstu over the full RENTAL data;
# rows with missing values in these columns are dropped by lm().
reg6 <- lm(
  formula = clrent ~ clpop + clavginc + cpctstu,
  data = RENTAL
)
print(summary(reg6))
## 
## Call:
## lm(formula = clrent ~ clpop + clavginc + cpctstu, data = RENTAL)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.18697 -0.06216 -0.01438  0.05518  0.23783 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.385521   0.036824  10.469 3.66e-15 ***
## clpop       0.072246   0.088343   0.818  0.41671    
## clavginc    0.309961   0.066477   4.663 1.79e-05 ***
## cpctstu     0.011203   0.004132   2.711  0.00873 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09013 on 60 degrees of freedom
##   (64 observations deleted due to missingness)
## Multiple R-squared:  0.3223, Adjusted R-squared:  0.2884 
## F-statistic:  9.51 on 3 and 60 DF,  p-value: 3.136e-05

The intercept is the predicted change in log rent when the changes in log population, log average income, and the student share of the population are all zero; it captures the common change in rents between the two years, which could be roughly equivalent to inflation. With those changes held at zero, log rents rise by about .386 over the ten years, i.e. roughly 39 percent by the log approximation (exactly, exp(.3855) − 1 ≈ 47 percent). Also, the zero conditional mean assumption likely doesn’t hold here, as many omitted factors likely contribute to average income and the makeup and size of the population.