These data come from Eric Fischer’s github, and resulted from some fairly obsessive data collection described here
My aim is to provide better graphs and some slightly different models
Unfortunately, while most of the data file is comma-separated, the first two columns are separated only by a space, so we need a bit of gymnastics
combined<-read.table("../combined",sep=",")
combined<-combined[,1:6]
year<-as.integer(gsub(" .*","",as.character(combined[,1])))
medrent<-as.numeric(gsub("^[0-9]+","",as.character(combined[,1])))
SF<-cbind(year=year,median_rent=medrent, combined[,-1])
names(SF)<-c("year","median_rent","housing_units","net_new_housing","employment","total_wages","CPI")
head(SF)
## year median_rent housing_units net_new_housing employment total_wages
## 1 1906 NA 84077 8444 NA NA
## 2 1907 NA 100342 16265 NA NA
## 3 1908 NA 111220 10878 NA NA
## 4 1909 NA 117465 6245 NA NA
## 5 1910 NA 125112 7647 NA NA
## 6 1911 NA 131757 6645 NA NA
## CPI
## 1 NA
## 2 NA
## 3 NA
## 4 NA
## 5 NA
## 6 NA
summary(SF)
## year median_rent housing_units net_new_housing
## Min. :1906 Min. : 35.0 Min. : 84077 Min. : 150
## 1st Qu.:1934 1st Qu.: 131.2 1st Qu.:235376 1st Qu.: 1208
## Median :1961 Median : 503.5 Median :295256 Median : 1902
## Mean :1961 Mean : 918.3 Mean :277199 Mean : 2765
## 3rd Qu.:1988 3rd Qu.:1643.8 3rd Qu.:334399 3rd Qu.: 3321
## Max. :2016 Max. :3650.0 Max. :379597 Max. :16265
## NA's :41 NA's :2 NA's :1
## employment total_wages CPI
## Min. :440498 Min. :5.423e+09 Min. :0.1000
## 1st Qu.:532506 1st Qu.:1.397e+10 1st Qu.:0.1745
## Median :555131 Median :2.051e+10 Median :0.3120
## Mean :550557 Mean :2.510e+10 Mean :0.7263
## 3rd Qu.:571902 3rd Qu.:3.523e+10 3rd Qu.:1.2580
## Max. :640378 Max. :5.884e+10 Max. :2.3277
## NA's :71 NA's :71 NA's :8
Nominal rental rates
plot(median_rent~year, data=SF, type="p",log="y")
with(SF, lines(supsmu(year,median_rent),col="purple",lwd=2))
## Warning in supsmu(year, median_rent): 41 observations with NAs, NaNs and/or
## Infs deleted
Real rental rates, after 1948, with a scatterplot smoother
SF$real84_median_rent<-with(SF,median_rent/CPI)
plot(real84_median_rent~year, data=subset(SF, year>=1948), type="p",log="y",
xlab="year",ylab="Median rent (1983 dollars)")
with(subset(SF,year>=1948), lines(supsmu(year,real84_median_rent),col="purple",lwd=2))
## Warning in supsmu(year, real84_median_rent): 1 observation with NA, NaN or
## Inf deleted
With trend line, getting rid of years with basically no data
SFok<-subset(SF, year>=1948 & year <=2015)
SFok$real84_median_rent<-with(SFok,median_rent/CPI)
plot(real84_median_rent~year, data=SFok, type="p",log="y",
xlab="year",ylab="Median rent (1983 dollars)")
with(SFok, lines(supsmu(year,real84_median_rent),col="sienna",lwd=2))
cat("annual increase=",
exp(coef(trend_model<-glm(real84_median_rent~year,family=gaussian(log),data=SFok))[2]))
## annual increase= 1.023962
with(SFok, lines(year, fitted(trend_model),lty=2,col="blue"))
legend("bottomright",lty=c(1,2),col=c("sienna","blue"),
bty="n",legend=c("smoother","constant real % increase"))
Eric Fischer used a multiplicative model for powers of explanatory variables, or more-or-less equivalently, a linear model for logarithms. Also, he uses log10 rather than log
pricemodel<-lm(log10(real84_median_rent)~log10(housing_units)+log10(employment)+log10(total_wages/CPI),
data=SFok,na.action=na.exclude)
summary(pricemodel)
##
## Call:
## lm(formula = log10(real84_median_rent) ~ log10(housing_units) +
## log10(employment) + log10(total_wages/CPI), data = SFok,
## na.action = na.exclude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.067660 -0.018309 -0.000776 0.012929 0.103568
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.9019 3.8627 -0.233 0.8167
## log10(housing_units) -2.4115 0.9161 -2.632 0.0124 *
## log10(employment) -0.3361 0.2433 -1.382 0.1756
## log10(total_wages/CPI) 1.8729 0.2094 8.945 1.12e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03386 on 36 degrees of freedom
## (28 observations deleted due to missingness)
## Multiple R-squared: 0.9512, Adjusted R-squared: 0.9471
## F-statistic: 233.7 on 3 and 36 DF, p-value: < 2.2e-16
plot(real84_median_rent~year, data=SFok, type="p",log="y",
xlab="year",ylab="Median rent (1983 dollars)")
lines(SFok$year, 10^(fitted(pricemodel)),col="purple")
Not quite what he found: maybe due to minimising squared error on the log scale rather than the raw scale?
pricemodel2<-glm(real84_median_rent~log10(housing_units)+log10(employment)+log10(total_wages/CPI),
data=SFok,na.action=na.exclude,family=gaussian(log))
summary(pricemodel2)
##
## Call:
## glm(formula = real84_median_rent ~ log10(housing_units) + log10(employment) +
## log10(total_wages/CPI), family = gaussian(log), data = SFok,
## na.action = na.exclude)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -129.78 -36.25 -10.77 25.99 244.49
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.6024 8.2109 -0.561 0.57859
## log10(housing_units) -5.0906 1.8103 -2.812 0.00792 **
## log10(employment) -0.3233 0.6855 -0.472 0.64005
## log10(total_wages/CPI) 4.0561 0.4441 9.133 6.61e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 5031.032)
##
## Null deviance: 3065328 on 39 degrees of freedom
## Residual deviance: 181117 on 36 degrees of freedom
## (28 observations deleted due to missingness)
## AIC: 460.24
##
## Number of Fisher Scoring iterations: 4
coef(pricemodel2)/log(10)
## (Intercept) log10(housing_units) log10(employment)
## -1.9988009 -2.2108258 -0.1404021
## log10(total_wages/CPI)
## 1.7615320
plot(real84_median_rent~year, data=SFok, type="p",log="y",
xlab="year",ylab="Median rent (1983 dollars)",xlim=c(1975,2015))
lines(SFok$year, 10^(fitted(pricemodel)),col="purple")
lines(SFok$year, (fitted(pricemodel2)),col="forestgreen")
legend("bottomright",lty=1,col=c("purple","forestgreen"),
legend=c("fit on log scale","fit on raw scale"),bty="n")
Still not quite the same, but close enough.
As Eric found, San Francisco real median asking prices for rentals have been going up about 2.5% per year since the 1950s, and can be modelled quite accurately since 1975 using employment, real salaries and wages, and number of housing units.