MARR 4.3

The Sunday, April 15, 2007, issue of the Houston Chronicle included a section devoted to real estate prices in Houston. In particular, data are presented on the 2006 median price per square foot (Yi in the data below) for 1,922 subdivisions. The data (HoustonRealEstate.txt) can be found on the book web site.

library(MASS)
library(ggplot2)
houston <- read.csv("HoustonRealEstate.txt", sep = "")  # sep = "" splits on whitespace
head(houston)
##       Yi ni   x1i   x2i
## 1 169.20  7 0.857 0.000
## 2  56.82  6 0.167 0.667
## 3  25.52  6 0.000 1.000
## 4  90.67  5 0.800 0.200
## 5  92.65  8 0.500 0.000
## 6  87.76  9 0.667 0.000

OLS

OLS <- lm(Yi ~ x1i + x2i, data = houston)
summary(OLS)
## 
## Call:
## lm(formula = Yi ~ x1i + x2i, data = houston)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -59.54 -16.64  -9.02   1.94 385.75 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   89.469      1.138  78.603   <2e-16 ***
## x1i            5.867      3.023   1.941   0.0524 .  
## x2i          -90.896      5.915 -15.368   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.27 on 1919 degrees of freedom
## Multiple R-squared:  0.1209, Adjusted R-squared:   0.12 
## F-statistic:   132 on 2 and 1919 DF,  p-value: < 2.2e-16
plot(OLS$residuals ~ OLS$fitted.values, xlab = "Fitted values", ylab = "Residuals")
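
Each Yi is a subdivision median based on ni underlying observations, so its sampling variability should shrink as ni grows, which is presumably why ni is used as the weight in the next section. One informal check (a sketch, not part of the original output; it uses only the houston data and the OLS fit above) is to plot the absolute standardized residuals against ni:

# Sketch: is the residual spread smaller for subdivisions with larger ni?
plot(houston$ni, abs(stdres(OLS)),
     xlab = "ni", ylab = "|standardized OLS residual|")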

WLS

WLS <- lm(Yi ~ x1i + x2i, data = houston, weights = (houston$ni))
summary(WLS)
## 
## Call:
## lm(formula = Yi ~ x1i + x2i, data = houston, weights = (houston$ni))
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -271.74  -67.88  -32.78   15.85 1901.11 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   88.767      1.104  80.383   <2e-16 ***
## x1i            4.262      2.754   1.548    0.122    
## x2i          -98.019      6.296 -15.568   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 162.6 on 1919 degrees of freedom
## Multiple R-squared:  0.1252, Adjusted R-squared:  0.1243 
## F-statistic: 137.4 on 2 and 1919 DF,  p-value: < 2.2e-16
plot(stdres(WLS) ~ WLS$fitted.values, xlab = "Fitted values", ylab = "Standardized residuals")
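
A side-by-side look at the two sets of estimates makes the effect of weighting by ni easy to see; a small sketch (not part of the original output):

# Sketch: compare OLS and WLS coefficient estimates
cbind(OLS = coef(OLS), WLS = coef(WLS))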

Huber

dist <- data.frame(cooks.distance(OLS))
s <- stdres(OLS)
sabs <- abs(s)
a <- cbind(houston, dist, s, sabs)
# sort by Cook's distance, largest first
assorted <- a[order(-a[, "cooks.distance.OLS."]), ]
assorted[1:10,]
##          Yi ni   x1i  x2i cooks.distance.OLS.         s      sabs
## 540  475.22  5 0.000 0.00         0.046671980 11.261111 11.261111
## 1921 187.77  7 1.000 0.00         0.014810765  2.705072  2.705072
## 1669 319.75 68 0.103 0.00         0.013381100  6.704172  6.704172
## 688  317.60 14 0.143 0.00         0.012556299  6.634440  6.634440
## 211  288.66 11 0.000 0.00         0.012444607  5.814920  5.814920
## 262  297.01  7 0.286 0.00         0.011148378  6.009158  6.009158
## 463  285.69  5 0.200 0.00         0.009136855  5.693226  5.693226
## 484  282.41  7 0.143 0.00         0.008969268  5.607276  5.607276
## 977   59.38 42 0.024 0.81         0.008176660  1.275657  1.275657
## 81   272.56  5 0.200 0.00         0.007948128  5.309975  5.309975
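
The sorted table is led by observation 540, which has by far the largest Cook's distance. A common rule of thumb (not used above) flags any point whose Cook's distance exceeds 4/n; a quick sketch of how many points cross that cutoff:

# Sketch: how many observations exceed the 4/n Cook's distance cutoff?
sum(cooks.distance(OLS) > 4 / nrow(houston))
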
rr.huber <- rlm(Yi ~ x1i + x2i, data = houston)
summary(rr.huber)
## 
## Call: rlm(formula = Yi ~ x1i + x2i, data = houston)
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -47.489  -7.950  -1.645   7.978 397.801 
## 
## Coefficients:
##             Value    Std. Error t value 
## (Intercept)  77.4187   0.4338   178.4466
## x1i           6.6952   1.1521     5.8111
## x2i         -54.3445   2.2545   -24.1052
## 
## Residual standard error: 11.8 on 1919 degrees of freedom
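
summary() for an rlm fit reports t values but no p-values. If rough significance statements are wanted, one option (a sketch, not part of the original analysis; the helper object huber_coefs is introduced only for illustration) is a normal approximation applied to those t values:

# Sketch: approximate two-sided p-values from the rlm t values
huber_coefs <- summary(rr.huber)$coefficients   # columns: Value, Std. Error, t value
2 * pnorm(abs(huber_coefs[, "t value"]), lower.tail = FALSE)
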
weights <- data.frame(Yi = houston$Yi, resid = rr.huber$resid, w = rr.huber$w)
head(weights)
##       Yi     resid         w
## 1 169.20 86.043549 0.1844998
## 2  56.82 14.530990 1.0000000
## 3  25.52  2.445801 1.0000000
## 4  90.67 18.764074 0.8460722
## 5  92.65 11.883718 1.0000000
## 6  87.76  5.875628 1.0000000
plot(rr.huber$resid ~ rr.huber$fitted.values, xlab = "Fitted values", ylab = "Residuals")
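
To see which subdivisions the Huber fit discounts most heavily, the weights data frame built above can be sorted by w; a sketch (not shown in the original):

# Sketch: the ten observations with the smallest Huber weights
weights[order(weights$w)[1:10], ]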

# Random 80/20 split; no seed is set, so the exact numbers below vary from run to run.
# (Note that OLS and rr.huber above were fit to the full data set.)
training_rows <- sample(1:nrow(houston), floor(0.8 * nrow(houston)))
training_data <- houston[training_rows, ]
test_data <- houston[-training_rows, ]
    
lm_Predicted <- predict(OLS, test_data)
rob_Predicted <- predict(rr.huber, test_data)
    
lm_actuals_pred <- cbind(lm_Predicted, test_data$Yi)
rob_actuals_pred <- cbind(rob_Predicted, test_data$Yi)
    
# Min-max accuracy for OLS: mean of min(predicted, actual) / max(predicted, actual)
mean(apply(lm_actuals_pred, 1, min) /
       apply(lm_actuals_pred, 1, max))
## [1] 0.8156891
colnames(rob_actuals_pred) <- c("predicted", "actual")
colnames(lm_actuals_pred) <- c("predicted", "actual")
head(lm_actuals_pred)
##    predicted actual
## 21  79.82122  74.96
## 26  94.08087  97.43
## 28  54.28409  80.99
## 34  61.15450  96.42
## 35  82.64190 128.64
## 41  38.29452  31.48
rob_actuals_pred <- data.frame(rob_actuals_pred)
lm_actuals_pred <- data.frame(lm_actuals_pred)
plot(lm_actuals_pred$actual, lm_actuals_pred$predicted, col = "blue",
     xlab = "Actual", ylab = "Predicted")
points(rob_actuals_pred$actual, rob_actuals_pred$predicted, col = "red")
abline(0, 1)

# Min-max accuracy for the Huber fit
mean(apply(rob_actuals_pred, 1, min) /
       apply(rob_actuals_pred, 1, max))
## [1] 0.8484765
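
Both min-max accuracies above come from models fit to the full data set, so this is not a genuinely out-of-sample comparison. One way to make it one would be to refit both models on training_data only and score them on test_data; the sketch below does this with test-set RMSE (the names OLS_tr, huber_tr, and rmse are introduced here for illustration, and the results will differ from the numbers shown above):

# Sketch: refit on the training split and compare test-set RMSE
OLS_tr   <- lm(Yi ~ x1i + x2i, data = training_data)
huber_tr <- rlm(Yi ~ x1i + x2i, data = training_data)
rmse <- function(actual, pred) sqrt(mean((actual - pred)^2))
c(OLS   = rmse(test_data$Yi, predict(OLS_tr, test_data)),
  Huber = rmse(test_data$Yi, predict(huber_tr, test_data)))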