The Sunday April 15, 2007 issue of the Houston Chronicle included a section devoted to real estate prices in Houston. In particular, data are presented on the 2006 median price per square foot for 1922 subdivisions. The data (HoustonRealEstate.txt) can be found on the book web site.
library(MASS)
## Warning: package 'MASS' was built under R version 3.5.2
library(ggplot2)
houston <- read.csv("HoustonRealEstate.txt",sep="")
head(houston)
## Yi ni x1i x2i
## 1 169.20 7 0.857 0.000
## 2 56.82 6 0.167 0.667
## 3 25.52 6 0.000 1.000
## 4 90.67 5 0.800 0.200
## 5 92.65 8 0.500 0.000
## 6 87.76 9 0.667 0.000
OLS
OLS <- lm(Yi ~ x1i + x2i, data=houston)
summary(OLS)
##
## Call:
## lm(formula = Yi ~ x1i + x2i, data = houston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.54 -16.64 -9.02 1.94 385.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 89.469 1.138 78.603 <2e-16 ***
## x1i 5.867 3.023 1.941 0.0524 .
## x2i -90.896 5.915 -15.368 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.27 on 1919 degrees of freedom
## Multiple R-squared: 0.1209, Adjusted R-squared: 0.12
## F-statistic: 132 on 2 and 1919 DF, p-value: < 2.2e-16
plot(OLS$residuals ~ OLS$fitted.values)
WLS
WLS <- lm(Yi ~ x1i + x2i, data=houston,weights = (houston$ni))
summary(WLS)
##
## Call:
## lm(formula = Yi ~ x1i + x2i, data = houston, weights = (houston$ni))
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -271.74 -67.88 -32.78 15.85 1901.11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 88.767 1.104 80.383 <2e-16 ***
## x1i 4.262 2.754 1.548 0.122
## x2i -98.019 6.296 -15.568 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 162.6 on 1919 degrees of freedom
## Multiple R-squared: 0.1252, Adjusted R-squared: 0.1243
## F-statistic: 137.4 on 2 and 1919 DF, p-value: < 2.2e-16
plot(stdres(WLS) ~ WLS$fitted.values)
Huber
dist<-data.frame(cooks.distance(OLS))
s<-stdres(OLS)
sabs<-abs(s)
a<-cbind(houston,dist,s,sabs)
assorted<-a[order(-dist),]
assorted[1:10,]
## Yi ni x1i x2i cooks.distance.OLS. s sabs
## 540 475.22 5 0.000 0.00 0.046671980 11.261111 11.261111
## 1921 187.77 7 1.000 0.00 0.014810765 2.705072 2.705072
## 1669 319.75 68 0.103 0.00 0.013381100 6.704172 6.704172
## 688 317.60 14 0.143 0.00 0.012556299 6.634440 6.634440
## 211 288.66 11 0.000 0.00 0.012444607 5.814920 5.814920
## 262 297.01 7 0.286 0.00 0.011148378 6.009158 6.009158
## 463 285.69 5 0.200 0.00 0.009136855 5.693226 5.693226
## 484 282.41 7 0.143 0.00 0.008969268 5.607276 5.607276
## 977 59.38 42 0.024 0.81 0.008176660 1.275657 1.275657
## 81 272.56 5 0.200 0.00 0.007948128 5.309975 5.309975
rr.huber<-rlm(Yi ~ x1i + x2i, data=houston)
summary(rr.huber)
##
## Call: rlm(formula = Yi ~ x1i + x2i, data = houston)
## Residuals:
## Min 1Q Median 3Q Max
## -47.489 -7.950 -1.645 7.978 397.801
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 77.4187 0.4338 178.4466
## x1i 6.6952 1.1521 5.8111
## x2i -54.3445 2.2545 -24.1052
##
## Residual standard error: 11.8 on 1919 degrees of freedom
weights<-data.frame(Yi=houston$Yi,resid=rr.huber$resid,w=rr.huber$w)
head(weights)
## Yi resid w
## 1 169.20 86.043549 0.1844998
## 2 56.82 14.530990 1.0000000
## 3 25.52 2.445801 1.0000000
## 4 90.67 18.764074 0.8460722
## 5 92.65 11.883718 1.0000000
## 6 87.76 5.875628 1.0000000
plot(rr.huber$resid ~ rr.huber$fitted.values)
training_rows <- sample(1:nrow(houston), 0.8 * nrow(houston))
training_data <- houston[training_rows, ]
test_data <- houston[-training_rows, ]
lm_Predicted <- predict(OLS, test_data)
rob_Predicted <- predict(rr.huber, test_data)
lm_actuals_pred <- cbind(lm_Predicted, test_data$Yi)
rob_actuals_pred <- cbind(rob_Predicted, test_data$Yi)
mean(apply(lm_actuals_pred, 1, min)/
apply(lm_actuals_pred, 1, max))
## [1] 0.8156891
colnames(rob_actuals_pred)<-c("predicted","actual")
colnames(lm_actuals_pred)<-c("predicted","actual")
head(lm_actuals_pred)
## predicted actual
## 21 79.82122 74.96
## 26 94.08087 97.43
## 28 54.28409 80.99
## 34 61.15450 96.42
## 35 82.64190 128.64
## 41 38.29452 31.48
rob_actuals_pred<-data.frame(rob_actuals_pred)
lm_actuals_pred<-data.frame(lm_actuals_pred)
plot(lm_actuals_pred$actual,lm_actuals_pred$predicted,col="blue")
abline(0,1)
points(rob_actuals_pred$actual,rob_actuals_pred$predicted,col="red")
abline(0,1)
mean(apply(rob_actuals_pred, 1, min)/
apply(rob_actuals_pred, 1, max))
## [1] 0.8484765