require(UsingR)
require(ggplot2)
require(tidyr)
require(dplyr)
require(plotly)
require(coefplot)

1 残差

housing <-  read.table ("http://www.jaredlander.com/data/housing.csv",
 sep = ",", header = TRUE,
 stringsAsFactors = FALSE) 
# わかりやすい列名に変更
names(housing) <- c("Neighborhood" , "Class" , "Units" , "YearBuilt" ,
"SqFt" , "Income" , "IncomeperSqFt" , "Expence" ,
"ExpencePerSqFt" , "NetIncome" , "Value" ,
"ValueperSqFt" , "Boro")
# 外れ値を除去
housing <- housing[housing$Units < 1000 ,]
head(housing)
##   Neighborhood          Class Units YearBuilt   SqFt   Income
## 1    FINANCIAL R9-CONDOMINIUM    42      1920  36500  1332615
## 2    FINANCIAL R4-CONDOMINIUM    78      1985 126420  6633257
## 3    FINANCIAL RR-CONDOMINIUM   500        NA 554174 17310000
## 4    FINANCIAL R4-CONDOMINIUM   282      1930 249076 11776313
## 5      TRIBECA R4-CONDOMINIUM   239      1985 219495 10004582
## 6      TRIBECA R4-CONDOMINIUM   133      1986 139719  5127687
##   IncomeperSqFt Expence ExpencePerSqFt NetIncome    Value ValueperSqFt
## 1         36.51  342005           9.37    990610  7300000       200.00
## 2         52.47 1762295          13.94   4870962 30690000       242.76
## 3         31.24 3543000           6.39  13767000 90970000       164.15
## 4         47.28 2784670          11.18   8991643 67556006       271.23
## 5         45.58 2783197          12.68   7221385 54320996       247.48
## 6         36.70 1497788          10.72   3629899 26737996       191.37
##        Boro
## 1 Manhattan
## 2 Manhattan
## 3 Manhattan
## 4 Manhattan
## 5 Manhattan
## 6 Manhattan
house1 <- lm(ValueperSqFt ~ Units + SqFt + Boro , data = housing)
summary(house1)
## 
## Call:
## lm(formula = ValueperSqFt ~ Units + SqFt + Boro, data = housing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -168.458  -22.680    1.493   26.290  261.761 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.430e+01  5.342e+00   8.293  < 2e-16 ***
## Units             -1.532e-01  2.421e-02  -6.330 2.88e-10 ***
## SqFt               2.070e-04  2.129e-05   9.723  < 2e-16 ***
## BoroBrooklyn       3.258e+01  5.561e+00   5.858 5.28e-09 ***
## BoroManhattan      1.274e+02  5.459e+00  23.343  < 2e-16 ***
## BoroQueens         3.011e+01  5.711e+00   5.272 1.46e-07 ***
## BoroStaten Island -7.114e+00  1.001e+01  -0.711    0.477    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.2 on 2613 degrees of freedom
## Multiple R-squared:  0.6034, Adjusted R-squared:  0.6025 
## F-statistic: 662.6 on 6 and 2613 DF,  p-value: < 2.2e-16
coefplot(house1)

回帰の結果を詳細に分析するにはbroomパッケージを使う。

require(broom)
## Loading required package: broom
head(augment(house1))
##   .rownames ValueperSqFt Units   SqFt      Boro  .fitted  .se.fit
## 1         1       200.00    42  36500 Manhattan 172.8475 1.338282
## 2         2       242.76    78 126420 Manhattan 185.9418 1.312763
## 3         3       164.15   500 554174 Manhattan 209.8077 4.095022
## 4         4       271.23   282 249076 Manhattan 180.0672 2.562165
## 5         5       247.48   239 219495 Manhattan 180.5341 2.110663
## 6         6       191.37   133 139719 Manhattan 180.2661 1.291444
##      .resid         .hat   .sigma      .cooksd .std.resid
## 1  27.15248 0.0009594821 43.20952 5.424169e-05  0.6287655
## 2  56.81815 0.0009232393 43.19848 2.285253e-04  1.3157048
## 3 -45.65775 0.0089836758 43.20347 1.459368e-03 -1.0615607
## 4  91.16278 0.0035168641 43.17583 2.252653e-03  2.1137487
## 5  66.94589 0.0023865978 43.19289 8.225193e-04  1.5513636
## 6  11.10385 0.0008934957 43.21225 8.446170e-06  0.2571216

1.1 適合値

ggplot(aes(x = .fitted , y = .resid) , data = house1) +
 geom_point(aes(color = Boro)) +
 geom_hline(yintercept = 0) +
 geom_smooth(se = FALSE) +
 labs(x = "Fitted Value" , y = "Residuals")

## Q-Q plot
Q-Q plotは理論値とデータがどれくらい一致するかを調べるプロット。Q-Q plotの説明はこのサイトが分かりやすい。

ggplot(house1 , aes(sample = .stdresid)) + stat_qq() + geom_abline()

フィットがよければ直線上に乗る。

1.2 残差のヒストグラム

ggplot(house1 , aes(x = .resid)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ガウシアンに見えるのでフィットはうまくいっていると言える。

2 モデル比較

複数のモデルを比較することを考える。まずモデルを5つ作る。

house2 <- lm(ValueperSqFt ~ Units * SqFt + Boro , data = housing)
house3 <- lm(ValueperSqFt ~ Units + SqFt * Boro + Class ,
data = housing)
house4 <- lm(ValueperSqFt ~ Units + SqFt * Boro + SqFt * Class ,
data = housing)
house5 <- lm(ValueperSqFt ~ Boro + Class , data = housing)
multiplot(house1,house2,house3,house4,house5)

Boro変数が効いていることが分かる。

AIC, BICを計算する。

AIC(house1, house2, house3, house4, house5)
##        df      AIC
## house1  8 27177.78
## house2  9 27163.82
## house3 15 27025.04
## house4 18 27001.69
## house5  9 27189.50
BIC(house1, house2, house3, house4, house5)
##        df      BIC
## house1  8 27224.75
## house2  9 27216.66
## house3 15 27113.11
## house4 18 27107.37
## house5  9 27242.34

どちらもhouse4モデルがベストだと言っている。

3 Cross validation

K-fold CVをする。{boot}はglmに対してしかK-fold CVできないので、lmを使って出来ることをわざわざglmでフィットしないといけない。

houseG1 <- glm(ValueperSqFt ~ Units + SqFt + Boro ,
 data = housing , family = gaussian(link = "identity"))
require(boot)
## Loading required package: boot
## 
## Attaching package: 'boot'
## The following object is masked from 'package:survival':
## 
##     aml
## The following object is masked from 'package:lattice':
## 
##     melanoma
houseCV1 <- cv.glm(housing , houseG1 , K = 5)
houseCV1$delta
## [1] 1877.944 1876.120

1番目がK-fold CVの誤差評価で2番目は補正をしてLOO-CVに近づけたものである。

いくつかモデルを使って比較をしてみる。

# glmを使い再度モデルを再適合
houseG2 <- glm(ValueperSqFt ~ Units * SqFt + Boro , data = housing)
houseG3 <- glm(ValueperSqFt ~ Units + SqFt * Boro + Class ,
data = housing)
houseG4 <- glm(ValueperSqFt ~ Units + SqFt * Boro + SqFt * Class ,
data = housing)
houseG5 <- glm(ValueperSqFt ~ Boro + Class , data = housing)

# クロスバリデーションを実行
houseCV2 <- cv.glm(housing , houseG2 , K = 5)
houseCV3 <- cv.glm(housing , houseG3 , K = 5)
houseCV4 <- cv.glm(housing , houseG4 , K = 5)
houseCV5 <- cv.glm(housing , houseG5 , K = 5)

## 誤差結果を確認
# 結果のdata.frameを作成
cvResults <- as.data.frame(rbind(houseCV1$delta , houseCV2$delta ,
houseCV3$delta , houseCV4$delta ,houseCV5$delta))
# より見やすいようにいくつか修正
# 列名を修正
names(cvResults) <- c("Error" , "Adjustted.Error")
# モデル名を追加
cvResults$Model <- sprintf("houseG%s" , 1:5)
# 結果を確認
cvResults
##      Error Adjustted.Error   Model
## 1 1877.944        1876.120 houseG1
## 2 1859.502        1858.471 houseG2
## 3 1761.561        1759.902 houseG3
## 4 1745.004        1742.999 houseG4
## 5 1885.753        1883.827 houseG5

3.1 Bootstrap

3.2 stepwise変数選択法