Data source

These data come from Eric Fischer’s github, and resulted from some fairly obsessive data collection described here

My aim is to provide better graphs and some slightly different models

Reading the data

Unfortunately, while most of the data file is comma-separated, the first two columns are separated only by a space, so we need a bit of gymnastics

combined<-read.table("../combined",sep=",")
combined<-combined[,1:6]
year<-as.integer(gsub(" .*","",as.character(combined[,1])))
medrent<-as.numeric(gsub("^[0-9]+","",as.character(combined[,1])))
SF<-cbind(year=year,median_rent=medrent, combined[,-1])
names(SF)<-c("year","median_rent","housing_units","net_new_housing","employment","total_wages","CPI")
head(SF)
##   year median_rent housing_units net_new_housing employment total_wages
## 1 1906          NA         84077            8444         NA          NA
## 2 1907          NA        100342           16265         NA          NA
## 3 1908          NA        111220           10878         NA          NA
## 4 1909          NA        117465            6245         NA          NA
## 5 1910          NA        125112            7647         NA          NA
## 6 1911          NA        131757            6645         NA          NA
##   CPI
## 1  NA
## 2  NA
## 3  NA
## 4  NA
## 5  NA
## 6  NA
summary(SF)
##       year       median_rent     housing_units    net_new_housing
##  Min.   :1906   Min.   :  35.0   Min.   : 84077   Min.   :  150  
##  1st Qu.:1934   1st Qu.: 131.2   1st Qu.:235376   1st Qu.: 1208  
##  Median :1961   Median : 503.5   Median :295256   Median : 1902  
##  Mean   :1961   Mean   : 918.3   Mean   :277199   Mean   : 2765  
##  3rd Qu.:1988   3rd Qu.:1643.8   3rd Qu.:334399   3rd Qu.: 3321  
##  Max.   :2016   Max.   :3650.0   Max.   :379597   Max.   :16265  
##                 NA's   :41       NA's   :2        NA's   :1      
##    employment      total_wages             CPI        
##  Min.   :440498   Min.   :5.423e+09   Min.   :0.1000  
##  1st Qu.:532506   1st Qu.:1.397e+10   1st Qu.:0.1745  
##  Median :555131   Median :2.051e+10   Median :0.3120  
##  Mean   :550557   Mean   :2.510e+10   Mean   :0.7263  
##  3rd Qu.:571902   3rd Qu.:3.523e+10   3rd Qu.:1.2580  
##  Max.   :640378   Max.   :5.884e+10   Max.   :2.3277  
##  NA's   :71       NA's   :71          NA's   :8

Models

Eric Fischer used a multiplicative model for powers of explanatory variables, or more-or-less equivalently, a linear model for logarithms. Also, he uses log10 rather than log

pricemodel<-lm(log10(real84_median_rent)~log10(housing_units)+log10(employment)+log10(total_wages/CPI),
               data=SFok,na.action=na.exclude)
summary(pricemodel)
## 
## Call:
## lm(formula = log10(real84_median_rent) ~ log10(housing_units) + 
##     log10(employment) + log10(total_wages/CPI), data = SFok, 
##     na.action = na.exclude)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.067660 -0.018309 -0.000776  0.012929  0.103568 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -0.9019     3.8627  -0.233   0.8167    
## log10(housing_units)    -2.4115     0.9161  -2.632   0.0124 *  
## log10(employment)       -0.3361     0.2433  -1.382   0.1756    
## log10(total_wages/CPI)   1.8729     0.2094   8.945 1.12e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03386 on 36 degrees of freedom
##   (28 observations deleted due to missingness)
## Multiple R-squared:  0.9512, Adjusted R-squared:  0.9471 
## F-statistic: 233.7 on 3 and 36 DF,  p-value: < 2.2e-16
plot(real84_median_rent~year, data=SFok, type="p",log="y",
     xlab="year",ylab="Median rent (1983 dollars)")
lines(SFok$year, 10^(fitted(pricemodel)),col="purple")

Not quite what he found: maybe due to minimising squared error on the log scale rather than the raw scale?

pricemodel2<-glm(real84_median_rent~log10(housing_units)+log10(employment)+log10(total_wages/CPI),
               data=SFok,na.action=na.exclude,family=gaussian(log))
summary(pricemodel2)
## 
## Call:
## glm(formula = real84_median_rent ~ log10(housing_units) + log10(employment) + 
##     log10(total_wages/CPI), family = gaussian(log), data = SFok, 
##     na.action = na.exclude)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -129.78   -36.25   -10.77    25.99   244.49  
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             -4.6024     8.2109  -0.561  0.57859    
## log10(housing_units)    -5.0906     1.8103  -2.812  0.00792 ** 
## log10(employment)       -0.3233     0.6855  -0.472  0.64005    
## log10(total_wages/CPI)   4.0561     0.4441   9.133 6.61e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 5031.032)
## 
##     Null deviance: 3065328  on 39  degrees of freedom
## Residual deviance:  181117  on 36  degrees of freedom
##   (28 observations deleted due to missingness)
## AIC: 460.24
## 
## Number of Fisher Scoring iterations: 4
coef(pricemodel2)/log(10)
##            (Intercept)   log10(housing_units)      log10(employment) 
##             -1.9988009             -2.2108258             -0.1404021 
## log10(total_wages/CPI) 
##              1.7615320
plot(real84_median_rent~year, data=SFok, type="p",log="y",
     xlab="year",ylab="Median rent (1983 dollars)",xlim=c(1975,2015))
lines(SFok$year, 10^(fitted(pricemodel)),col="purple")
lines(SFok$year, (fitted(pricemodel2)),col="forestgreen")
legend("bottomright",lty=1,col=c("purple","forestgreen"),
       legend=c("fit on log scale","fit on raw scale"),bty="n")

Still not quite the same, but close enough.

As Eric found, San Francisco real median asking prices for rentals have been going up about 2.5% per year since the 1950s, and can be modelled quite accurately since 1975 using employment, real salaries and wages, and number of housing units.