Load the necessary packages:
knitr::opts_chunk$set(echo = TRUE)
library(haven)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(sandwich)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load the Davis 2018 home-sales data from a Stata .dta file.
Davis2018 <- read_dta("~/Downloads/Davis2018.dta")

# Pull the closing year and month out of ClosingDate by character position.
# Assumes ClosingDate renders as "YYYY-MM-DD" (year in 1-4, month in 6-7).
Davis2018$ClosingYear <- as.numeric(substr(Davis2018$ClosingDate, 1, 4))
# Fix: convert the extracted month substring. The earlier version converted
# ClosingDate itself, so ClosingMonth held the raw numeric date, not 1-12.
# NOTE(review): any regression output rendered from the earlier version that
# uses ClosingMonth must be regenerated by re-knitting the document.
Davis2018$ClosingMonth <- as.numeric(substr(Davis2018$ClosingDate, 6, 7))

# Restrict to sales whose closing year is 2018.
sample2018 <- Davis2018 %>%
  filter(ClosingYear == 2018)
# Average sale price by number of bedrooms for the 2018 sample, as a bar chart.
# split() groups the sale prices by bedroom count.
fig.dist <- split(sample2018$SalePrice, sample2018$Bedroom)
# vapply() guarantees one numeric mean per group (sapply's return type can
# vary with its input); TRUE is spelled out because T can be reassigned.
fig.mean <- vapply(fig.dist, mean, numeric(1), na.rm = TRUE)
# barplot() draws the chart and returns the bar midpoints, kept in houseplot.
houseplot <- barplot(fig.mean, xlab = "Bedrooms", ylab = "Average Sale Price")
# Simple OLS regression of SalePrice on the ClosingMonth column of Davis2018.
reg <- lm(Davis2018$SalePrice ~ Davis2018$ClosingMonth)
# Print the coefficient table, residual summary, R-squared and F-statistic.
summary(reg)
##
## Call:
## lm(formula = Davis2018$SalePrice ~ Davis2018$ClosingMonth)
##
## Residuals:
## Min 1Q Median 3Q Max
## -394875 -183880 -14545 117956 1021005
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1673907.77 2077479.46 0.806 0.421
## Davis2018$ClosingMonth -55.53 117.07 -0.474 0.636
##
## Residual standard error: 227500 on 297 degrees of freedom
## Multiple R-squared: 0.0007569, Adjusted R-squared: -0.002608
## F-statistic: 0.225 on 1 and 297 DF, p-value: 0.6356
With an F statistic of .225 leading to a p-value of .6356, we find the overall regression is not significant at the .05 level.
# Coefficient t-tests for the SalePrice ~ ClosingMonth model using the
# sandwich (heteroskedasticity-robust) covariance estimator. The model is the
# one already fitted and stored in `reg`, so it is reused rather than refit.
coeftest(reg, vcov = sandwich)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1673907.766 2167519.346 0.7723 0.4406
## Davis2018$ClosingMonth -55.526 122.224 -0.4543 0.6499
By using the lmtest and sandwich packages, I obtained slightly larger, but robust, standard errors.
# Multiple regression of SalePrice on ListPrice and DaysOnMarket, using the
# full Davis2018 data set; summary() prints the fitted model's statistics.
reg2 <- lm(Davis2018$SalePrice ~ Davis2018$ListPrice + Davis2018$DaysOnMarket)
summary(reg2)
##
## Call:
## lm(formula = Davis2018$SalePrice ~ Davis2018$ListPrice + Davis2018$DaysOnMarket)
##
## Residuals:
## Min 1Q Median 3Q Max
## -233976 -10198 -2858 9512 86338
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.240e+04 5.084e+03 4.405 1.48e-05 ***
## Davis2018$ListPrice 9.772e-01 7.297e-03 133.924 < 2e-16 ***
## Davis2018$DaysOnMarket -2.603e+02 4.371e+01 -5.957 7.30e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28130 on 296 degrees of freedom
## Multiple R-squared: 0.9848, Adjusted R-squared: 0.9847
## F-statistic: 9569 on 2 and 296 DF, p-value: < 2.2e-16
The slope coefficients in a multiple regression like this are interpreted as the change in sale price from a one-unit increase in list price or days on market, holding the other regressor constant. In addition, I don’t believe the zero conditional mean assumption is satisfied. Since there are only two regressors, neither of which is a house characteristic, it is almost certain that omitted house characteristics affect sale price and are correlated with list price and days on market, biasing the OLS estimates on the included regressors.
# Extends the previous specification with two house characteristics:
# HasPool and Size are added alongside ListPrice and DaysOnMarket.
reg3 <- lm(Davis2018$SalePrice ~ Davis2018$ListPrice + Davis2018$DaysOnMarket + Davis2018$HasPool + Davis2018$Size)
summary(reg3)
##
## Call:
## lm(formula = Davis2018$SalePrice ~ Davis2018$ListPrice + Davis2018$DaysOnMarket +
## Davis2018$HasPool + Davis2018$Size)
##
## Residuals:
## Min 1Q Median 3Q Max
## -233237 -9864 -3289 10110 88989
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.222e+04 5.092e+03 4.362 1.78e-05 ***
## Davis2018$ListPrice 9.939e-01 1.627e-02 61.093 < 2e-16 ***
## Davis2018$DaysOnMarket -2.580e+02 4.386e+01 -5.883 1.10e-08 ***
## Davis2018$HasPool 4.184e+01 3.921e+03 0.011 0.991
## Davis2018$Size -6.464e+00 5.722e+00 -1.130 0.260
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28160 on 294 degrees of freedom
## Multiple R-squared: 0.9848, Adjusted R-squared: 0.9846
## F-statistic: 4775 on 4 and 294 DF, p-value: < 2.2e-16
This regression, which adds the pool dummy variable and a regressor for size of the house, is highly significant at any threshold with an F-statistic of 4775.
# Quadratic-in-days specification: SalePrice on ListPrice, DaysOnMarket and
# DaysOnMarket squared. I() makes ^2 arithmetic squaring rather than formula
# syntax, so the effect of days on market can vary with the number of days.
reg4 <- lm(Davis2018$SalePrice ~ Davis2018$ListPrice +
Davis2018$DaysOnMarket + I(Davis2018$DaysOnMarket^2))
summary(reg4)
##
## Call:
## lm(formula = Davis2018$SalePrice ~ Davis2018$ListPrice + Davis2018$DaysOnMarket +
## I(Davis2018$DaysOnMarket^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -232082 -10160 -3159 10242 85451
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.453e+04 5.283e+03 4.643 5.17e-06 ***
## Davis2018$ListPrice 9.767e-01 7.292e-03 133.942 < 2e-16 ***
## Davis2018$DaysOnMarket -3.964e+02 1.034e+02 -3.833 0.000155 ***
## I(Davis2018$DaysOnMarket^2) 8.068e-01 5.560e-01 1.451 0.147821
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 28080 on 295 degrees of freedom
## Multiple R-squared: 0.9849, Adjusted R-squared: 0.9847
## F-statistic: 6404 on 3 and 295 DF, p-value: < 2.2e-16
# Show the reg4 coefficient estimates at full printed precision.
coef(reg4)
## (Intercept) Davis2018$ListPrice
## 24530.0085316 0.9766877
## Davis2018$DaysOnMarket I(Davis2018$DaysOnMarket^2)
## -396.3832153 0.8068395
# Predicted change in sale price for 7 days on the market versus 0, from the
# reg4 estimates: b_days * 7 + b_days_squared * 7^2. The squared-term
# coefficient must be multiplied by 7^2 = 49 (not by 7).
-396.3832153*7 + 0.8068395*7^2
## [1] -2735.147
If a house is on the market a week longer than an otherwise identical house (7 days rather than 0), it will on average sell for about 2,735 dollars less.
Question 2:
# Load the rental data. Fix: the earlier chained assignment
# (RENTAL <- Davis2018 <- ...) also overwrote the Davis2018 housing data with
# the rental data; only RENTAL is assigned here.
RENTAL <- read_dta("~/Downloads/RENTAL.dta")

# Keep the rows flagged y90 == 1 (presumably the 1990 observations).
rental90 <- RENTAL %>%
  filter(y90 == 1)

# Regress lrent on lpop, lavginc and pctstu for that subsample.
reg5 <- lm(lrent ~ lpop + lavginc + pctstu, data = rental90)
summary(reg5)
##
## Call:
## lm(formula = lrent ~ lpop + lavginc + pctstu, data = rental90)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.22706 -0.09469 -0.02827 0.03806 0.48271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.042780 0.843875 0.051 0.960
## lpop 0.065868 0.038826 1.696 0.095 .
## lavginc 0.507015 0.080836 6.272 4.29e-08 ***
## pctstu 0.005630 0.001742 3.232 0.002 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1512 on 60 degrees of freedom
## Multiple R-squared: 0.4579, Adjusted R-squared: 0.4308
## F-statistic: 16.89 on 3 and 60 DF, p-value: 4.541e-08
If both the independent and dependent variables are in logs, the coefficient can be interpreted as an elasticity. So a one percent increase in average income is associated with about .507 percent higher rents, holding the other regressors constant. Since pctstu is not in logs, we say that a one percentage point increase in the share of the population who are students is associated with about .563 percent higher rents in the area.
## lrent
## 1 0.5516071
## 2 0.4289236
## 3 0.4855080
## 4 0.7894783
## 5 0.6664791
## 6 0.8253188
## 7 0.5453234
## 8 0.4959292
## 9 0.8087320
## 10 0.4590969
## 11 0.6664791
## 12 0.6182160
## 13 0.5519438
## 14 0.6380877
## 15 0.6518292
## 16 0.5108256
## 17 0.4759617
## 18 0.5705447
## 19 0.5148497
## 20 0.5697684
## 21 0.5372767
## 22 0.3928337
## 23 0.7387824
## 24 0.5980387
## 25 0.5521994
## 26 0.6087804
## 27 0.5166907
## 28 0.4338241
## 29 0.4815888
## 30 0.4951186
## 31 0.5319953
## 32 0.7624297
## 33 0.7659063
## 34 0.6666079
## 35 0.6471848
## 36 0.6706743
## 37 0.6304369
## 38 0.4818382
## 39 0.4427724
## 40 0.5300832
## 41 0.4443808
## 42 0.4805326
## 43 0.3944759
## 44 0.4802251
## 45 0.4519849
## 46 0.5346365
## 47 0.6289048
## 48 0.6109095
## 49 0.6580558
## 50 0.4919286
## 51 0.5404701
## 52 0.4172158
## 53 0.4415412
## 54 0.5785809
## 55 0.5337491
## 56 0.5047464
## 57 0.5913644
## 58 0.7074475
## 59 0.5596156
## 60 0.4216728
## 61 0.4784904
## 62 0.4475307
## 63 0.4470139
## 64 0.6639175
## lrent
## [1,] TRUE
## [2,] TRUE
## [3,] TRUE
## [4,] TRUE
## [5,] TRUE
## [6,] TRUE
## [7,] TRUE
## [8,] TRUE
## [9,] TRUE
## [10,] TRUE
## [11,] TRUE
## [12,] TRUE
## [13,] TRUE
## [14,] TRUE
## [15,] TRUE
## [16,] TRUE
## [17,] TRUE
## [18,] TRUE
## [19,] TRUE
## [20,] TRUE
## [21,] TRUE
## [22,] TRUE
## [23,] TRUE
## [24,] TRUE
## [25,] TRUE
## [26,] TRUE
## [27,] TRUE
## [28,] TRUE
## [29,] TRUE
## [30,] TRUE
## [31,] TRUE
## [32,] TRUE
## [33,] TRUE
## [34,] TRUE
## [35,] TRUE
## [36,] TRUE
## [37,] TRUE
## [38,] TRUE
## [39,] TRUE
## [40,] TRUE
## [41,] TRUE
## [42,] TRUE
## [43,] TRUE
## [44,] TRUE
## [45,] TRUE
## [46,] TRUE
## [47,] TRUE
## [48,] TRUE
## [49,] TRUE
## [50,] TRUE
## [51,] TRUE
## [52,] TRUE
## [53,] TRUE
## [54,] TRUE
## [55,] TRUE
## [56,] TRUE
## [57,] TRUE
## [58,] TRUE
## [59,] TRUE
## [60,] TRUE
## [61,] TRUE
## [62,] TRUE
## [63,] TRUE
## [64,] TRUE
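We can interpret clrent = .5516 as a change of .5516 log points in rents when going from 1980 to 1990, i.e. roughly a 55 percent increase using the log approximation (the exact change is exp(.5516) - 1, about 74 percent).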
# Regression in changes: clrent on clpop, clavginc and cpctstu, fitted on the
# full RENTAL data. lm() drops rows with missing values in these variables.
reg6 <- lm(clrent ~ clpop + clavginc + cpctstu, data = RENTAL)
summary(reg6)
##
## Call:
## lm(formula = clrent ~ clpop + clavginc + cpctstu, data = RENTAL)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18697 -0.06216 -0.01438 0.05518 0.23783
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.385521 0.036824 10.469 3.66e-15 ***
## clpop 0.072246 0.088343 0.818 0.41671
## clavginc 0.309961 0.066477 4.663 1.79e-05 ***
## cpctstu 0.011203 0.004132 2.711 0.00873 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.09013 on 60 degrees of freedom
## (64 observations deleted due to missingness)
## Multiple R-squared: 0.3223, Adjusted R-squared: 0.2884
## F-statistic: 9.51 on 3 and 60 DF, p-value: 3.136e-05
The intercept is the predicted change in log rent when the changes in population, average income, and the student share of the population are all zero; it captures the common change in rents over the decade not explained by those factors, which could be roughly equivalent to inflation. With those changes held at zero, rents increase by about .386 log points, i.e. roughly 38.6 percent, over the 10 years (not .38 percent). Also, the zero conditional mean assumption likely doesn’t hold here, as many omitted factors that affect the change in rents are likely correlated with the changes in average income and in the makeup and size of the population.