Abstract
みんなのR、17章のお勉強。require(UsingR)
require(ggplot2)
require(tidyr)
require(dplyr)
require(plotly)
require(coefplot)
housing <- read.table ("http://www.jaredlander.com/data/housing.csv",
sep = ",", header = TRUE,
stringsAsFactors = FALSE)
# わかりやすい列名に変更
names(housing) <- c("Neighborhood" , "Class" , "Units" , "YearBuilt" ,
"SqFt" , "Income" , "IncomeperSqFt" , "Expence" ,
"ExpencePerSqFt" , "NetIncome" , "Value" ,
"ValueperSqFt" , "Boro")
# 外れ値を除去
housing <- housing[housing$Units < 1000 ,]
head(housing)
## Neighborhood Class Units YearBuilt SqFt Income
## 1 FINANCIAL R9-CONDOMINIUM 42 1920 36500 1332615
## 2 FINANCIAL R4-CONDOMINIUM 78 1985 126420 6633257
## 3 FINANCIAL RR-CONDOMINIUM 500 NA 554174 17310000
## 4 FINANCIAL R4-CONDOMINIUM 282 1930 249076 11776313
## 5 TRIBECA R4-CONDOMINIUM 239 1985 219495 10004582
## 6 TRIBECA R4-CONDOMINIUM 133 1986 139719 5127687
## IncomeperSqFt Expence ExpencePerSqFt NetIncome Value ValueperSqFt
## 1 36.51 342005 9.37 990610 7300000 200.00
## 2 52.47 1762295 13.94 4870962 30690000 242.76
## 3 31.24 3543000 6.39 13767000 90970000 164.15
## 4 47.28 2784670 11.18 8991643 67556006 271.23
## 5 45.58 2783197 12.68 7221385 54320996 247.48
## 6 36.70 1497788 10.72 3629899 26737996 191.37
## Boro
## 1 Manhattan
## 2 Manhattan
## 3 Manhattan
## 4 Manhattan
## 5 Manhattan
## 6 Manhattan
house1 <- lm(ValueperSqFt ~ Units + SqFt + Boro , data = housing)
summary(house1)
##
## Call:
## lm(formula = ValueperSqFt ~ Units + SqFt + Boro, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.458 -22.680 1.493 26.290 261.761
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.430e+01 5.342e+00 8.293 < 2e-16 ***
## Units -1.532e-01 2.421e-02 -6.330 2.88e-10 ***
## SqFt 2.070e-04 2.129e-05 9.723 < 2e-16 ***
## BoroBrooklyn 3.258e+01 5.561e+00 5.858 5.28e-09 ***
## BoroManhattan 1.274e+02 5.459e+00 23.343 < 2e-16 ***
## BoroQueens 3.011e+01 5.711e+00 5.272 1.46e-07 ***
## BoroStaten Island -7.114e+00 1.001e+01 -0.711 0.477
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43.2 on 2613 degrees of freedom
## Multiple R-squared: 0.6034, Adjusted R-squared: 0.6025
## F-statistic: 662.6 on 6 and 2613 DF, p-value: < 2.2e-16
coefplot(house1)
回帰の結果を詳細に分析するには
broomパッケージを使う。
require(broom)
## Loading required package: broom
head(augment(house1))
## .rownames ValueperSqFt Units SqFt Boro .fitted .se.fit
## 1 1 200.00 42 36500 Manhattan 172.8475 1.338282
## 2 2 242.76 78 126420 Manhattan 185.9418 1.312763
## 3 3 164.15 500 554174 Manhattan 209.8077 4.095022
## 4 4 271.23 282 249076 Manhattan 180.0672 2.562165
## 5 5 247.48 239 219495 Manhattan 180.5341 2.110663
## 6 6 191.37 133 139719 Manhattan 180.2661 1.291444
## .resid .hat .sigma .cooksd .std.resid
## 1 27.15248 0.0009594821 43.20952 5.424169e-05 0.6287655
## 2 56.81815 0.0009232393 43.19848 2.285253e-04 1.3157048
## 3 -45.65775 0.0089836758 43.20347 1.459368e-03 -1.0615607
## 4 91.16278 0.0035168641 43.17583 2.252653e-03 2.1137487
## 5 66.94589 0.0023865978 43.19289 8.225193e-04 1.5513636
## 6 11.10385 0.0008934957 43.21225 8.446170e-06 0.2571216
ggplot(aes(x = .fitted , y = .resid) , data = house1) +
geom_point(aes(color = Boro)) +
geom_hline(yintercept = 0) +
geom_smooth(se = FALSE) +
labs(x = "Fitted Value" , y = "Residuals")
## Q-Q plot
Q-Q plotは理論値とデータがどれくらい一致するかを調べるプロット。Q-Q plotの説明はこのサイトが分かりやすい。
ggplot(house1 , aes(sample = .stdresid)) + stat_qq() + geom_abline()
フィットがよければ直線上に乗る。
ggplot(house1 , aes(x = .resid)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ガウシアンに見えるのでフィットはうまくいっていると言える。
複数のモデルを比較することを考える。まずモデルを5つ作る。
house2 <- lm(ValueperSqFt ~ Units * SqFt + Boro , data = housing)
house3 <- lm(ValueperSqFt ~ Units + SqFt * Boro + Class ,
data = housing)
house4 <- lm(ValueperSqFt ~ Units + SqFt * Boro + SqFt * Class ,
data = housing)
house5 <- lm(ValueperSqFt ~ Boro + Class , data = housing)
multiplot(house1,house2,house3,house4,house5)
Boro変数が効いていることが分かる。
AIC, BICを計算する。
AIC(house1, house2, house3, house4, house5)
## df AIC
## house1 8 27177.78
## house2 9 27163.82
## house3 15 27025.04
## house4 18 27001.69
## house5 9 27189.50
BIC(house1, house2, house3, house4, house5)
## df BIC
## house1 8 27224.75
## house2 9 27216.66
## house3 15 27113.11
## house4 18 27107.37
## house5 9 27242.34
どちらもhouse4モデルがベストだと言っている。
K-fold CVをする。{boot}はglmに対してしかK-fold CVできないので、lmを使って出来ることをわざわざglmでフィットしないといけない。
houseG1 <- glm(ValueperSqFt ~ Units + SqFt + Boro ,
data = housing , family = gaussian(link = "identity"))
require(boot)
## Loading required package: boot
##
## Attaching package: 'boot'
## The following object is masked from 'package:survival':
##
## aml
## The following object is masked from 'package:lattice':
##
## melanoma
houseCV1 <- cv.glm(housing , houseG1 , K = 5)
houseCV1$delta
## [1] 1877.944 1876.120
1番目がK-fold CVの誤差評価で2番目は補正をしてLOO-CVに近づけたものである。
いくつかモデルを使って比較をしてみる。
# glmを使い再度モデルを再適合
houseG2 <- glm(ValueperSqFt ~ Units * SqFt + Boro , data = housing)
houseG3 <- glm(ValueperSqFt ~ Units + SqFt * Boro + Class ,
data = housing)
houseG4 <- glm(ValueperSqFt ~ Units + SqFt * Boro + SqFt * Class ,
data = housing)
houseG5 <- glm(ValueperSqFt ~ Boro + Class , data = housing)
# クロスバリデーションを実行
houseCV2 <- cv.glm(housing , houseG2 , K = 5)
houseCV3 <- cv.glm(housing , houseG3 , K = 5)
houseCV4 <- cv.glm(housing , houseG4 , K = 5)
houseCV5 <- cv.glm(housing , houseG5 , K = 5)
## 誤差結果を確認
# 結果のdata.frameを作成
cvResults <- as.data.frame(rbind(houseCV1$delta , houseCV2$delta ,
houseCV3$delta , houseCV4$delta ,houseCV5$delta))
# より見やすいようにいくつか修正
# 列名を修正
names(cvResults) <- c("Error" , "Adjustted.Error")
# モデル名を追加
cvResults$Model <- sprintf("houseG%s" , 1:5)
# 結果を確認
cvResults
## Error Adjustted.Error Model
## 1 1877.944 1876.120 houseG1
## 2 1859.502 1858.471 houseG2
## 3 1761.561 1759.902 houseG3
## 4 1745.004 1742.999 houseG4
## 5 1885.753 1883.827 houseG5