国立環境研究所(追記:その後統数研に異動)の村上大輔さんが作成したパッケージ spmoran を,(1) exampleのBoston住宅価格データの分析をreproduce,(2) H28年度東京都公示地価データでreplicateする.
曰く「空間的自己相関を考慮した回帰係数の推定や、場所的に変化する回帰係数の推定を、高速に行うことのできるパッケージ」とのこと.
# install.packages("spmoran")
library(spmoran)
## Warning: package 'spmoran' was built under R version 3.3.3
# citation("spmoran")
library(ggplot2)
library(spdep)
## Loading required package: sp
## Loading required package: Matrix
data(boston)
nrow(boston.c) # sample size (cross-sectional data)
## [1] 506
head(boston.c, 2)
## TOWN TOWNNO TRACT LON LAT MEDV CMEDV CRIM ZN INDUS CHAS
## 1 Nahant 0 2011 -70.955 42.2550 24.0 24.0 0.00632 18 2.31 0
## 2 Swampscott 1 2021 -70.950 42.2875 21.6 21.6 0.02731 0 7.07 0
## NOX RM AGE DIS RAD TAX PTRATIO B LSTAT
## 1 0.538 6.575 65.2 4.0900 1 296 15.3 396.9 4.98
## 2 0.469 6.421 78.9 4.9671 2 242 17.8 396.9 9.14
以降はexampleとvignetteを真似る.実際には緯度・経度を(距離計算のため)平面直角座標系に変換する必要があるが,ちょっと手を抜く.
boston_coords <- boston.c[, c("LON", "LAT")]
plot(boston_coords) # 空間分布
ggplotの場合(価格の中央値 CMEDV でグラデーションを付ける). 「coord_quickmap()」はなくても描画自体はできるが,指定しないとアスペクト比が崩れる.
ggplot(data = boston.c, mapping = aes(x = LON, y = LAT)) +
geom_point(mapping = aes(colour = CMEDV)) +
coord_quickmap()
threshold引数を与えない場合(デフォルト設定),正の固有値に対応する固有ベクトルが抽出される.
boston_meigen <- spmoran::meigen(coords = boston_coords) # extraction of Moran’s eigenvectors
## 55/506 eigen-pairs are extracted
plot(boston_meigen$ev) # 固有値
固有ベクトルの空間分布描画のための下準備.第1・第2・第5・第10固有ベクトルの4つ.
boston_sf1_col <- cut(boston_meigen$sf[, 1], 10) # 描画の色付けのために第一固有ベクトルを等幅で分割
boston_sf2_col <- cut(boston_meigen$sf[, 2], 10) # 2nd
boston_sf5_col <- cut(boston_meigen$sf[, 5], 10) # 5th
boston_sf10_col <- cut(boston_meigen$sf[, 10], 10) # 10th
rb10 <- rainbow(n = 10, start = .7, end = .1) # レインボーカラーの設定
描画.
par(mfrow = c(2, 2), mai = rep(0, 4))
# 第1固有ベクトル(左上)
plot(boston_coords, pch = 20, axes = F, col = rb10[boston_sf1_col])
legend(x = "topright", legend = levels(boston_sf1_col), pch = 20, cex = .5, col = rb10)
legend(x = "topleft", legend = "1st", bty = "n", pch = "")
# 第2固有ベクトル(右上)
plot(boston_coords, pch = 20, axes = F, col = rb10[boston_sf2_col])
legend(x = "topright", legend = levels(boston_sf2_col), pch = 20, cex = .5, col = rb10)
legend(x = "topleft", legend = "2nd", bty = "n", pch = "")
# 第5固有ベクトル(左下)
plot(boston_coords, pch = 20, axes = F, col = rb10[boston_sf5_col])
legend(x = "topright", legend = levels(boston_sf5_col), pch = 20, cex = .5, col = rb10)
legend(x = "topleft", legend = "5th", bty = "n", pch = "")
# 第10固有ベクトル(右下)
plot(boston_coords, pch = 20, axes = F, col = rb10[boston_sf10_col])
legend(x = "topright", legend = levels(boston_sf10_col), pch = 20, cex = .5, col = rb10)
legend(x = "topleft", legend = "10th", bty = "n", pch = "")
boston_ols <- lm(formula = log(CMEDV) ~ CRIM + NOX, data = boston.c)
summary(boston_ols)
##
## Call:
## lm(formula = log(CMEDV) ~ CRIM + NOX, data = boston.c)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.08324 -0.18885 -0.05174 0.16442 1.10259
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.789578 0.074287 51.013 <2e-16 ***
## CRIM -0.018106 0.001832 -9.882 <2e-16 ***
## NOX -1.243193 0.136003 -9.141 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3212 on 503 degrees of freedom
## Multiple R-squared: 0.3833, Adjusted R-squared: 0.3809
## F-statistic: 156.3 on 2 and 503 DF, p-value: < 2.2e-16
残差プロット.系統的なパターンが確認できる.
boston.c$resid_ols <- residuals(boston_ols)
ggplot(data = boston.c, mapping = aes(x = LON, y = LAT)) +
geom_point(mapping = aes(colour = resid_ols)) +
coord_quickmap()
boston_x <- boston.c[, c("CRIM", "NOX")] # 説明変数行列
boston_esf <- spmoran::esf(y = log(boston.c$CMEDV), x = boston_x, meig = boston_meigen)
## 38/55 eigenvectors are selected
boston_esf$b # estimates
## Estimate SE t_value p_value
## (Intercept) 4.47874555 0.117750586 38.035866 7.576872e-145
## CRIM -0.01192241 0.001579564 -7.547909 2.353599e-13
## NOX -2.52590252 0.211723195 -11.930212 8.253279e-29
head(boston_esf$vif) # variance inflation factor
## VIF
## CRIM 1.932361
## NOX 6.300820
## sf2 1.337738
## sf35 1.002090
## sf27 1.034274
## sf24 1.055055
boston_esf$e # error statistics
## stat
## resid_SE 0.2196420
## adjR2 0.7105819
## logLik 70.3681474
## AIC -56.7362947
## BIC 120.7782454
残差プロット.
boston.c$resid_esf <- boston_esf$resid
ggplot(data = boston.c, mapping = aes(x = LON, y = LAT)) +
geom_point(mapping = aes(colour = resid_esf)) +
coord_quickmap()
boston_esf <- spmoran::resf(y = log(boston.c$CMEDV), x = boston_x, meig = boston_meigen)
## RE-ESF model fit by REML
boston_esf$b # estimates
## Estimate SE t_value p_value
## (Intercept) 4.35122179 0.122169487 35.616273 0.000000e+00
## CRIM -0.01216329 0.001523113 -7.985807 1.154632e-14
## NOX -2.29443448 0.220311367 -10.414508 0.000000e+00
boston_esf$e # error stat(s)
## stat
## resid_SE 0.2116215
## adjR2(cond) 0.7286355
## rlogLik -23.8780504
## AIC 61.7561009
## BIC 91.3418575
boston_esf$s # shrinkage param(s)
## par
## shrink_sf_SE 0.3299662
## shrink_sf_alpha 0.6026645
boston_x_vary <- boston.c$NOX # X with spatially varying coefficients
boston_x_const <- boston.c$CRIM # X with constant coefficients
boston_re_esf <- resf_vc(y = log(boston.c$CMEDV), x = boston_x_vary, xconst = boston_x_const, meig = boston_meigen)
## RE-ESF model fit by REML
head(boston_re_esf$b_vc) # (const + NOX) estimates
## (Intercept) V1
## 1 4.276740 -2.294435
## 2 4.211276 -2.294435
## 3 4.379810 -2.294435
## 4 4.428908 -2.294435
## 5 4.424535 -2.294435
## 6 4.382977 -2.294435
summary(boston_re_esf$b_vc)
## (Intercept) V1
## Min. :3.928 Min. :-2.294
## 1st Qu.:4.167 1st Qu.:-2.294
## Median :4.313 Median :-2.294
## Mean :4.351 Mean :-2.294
## 3rd Qu.:4.496 3rd Qu.:-2.294
## Max. :5.094 Max. :-2.294
summary(boston_re_esf$p_vc)
## (Intercept) V1
## Min. :0 Min. :0
## 1st Qu.:0 1st Qu.:0
## Median :0 Median :0
## Mean :0 Mean :0
## 3rd Qu.:0 3rd Qu.:0
## Max. :0 Max. :0
boston_re_esf$b # CRIM estimates
## Estimate SE t_value p_value
## V1 -0.01216338 0.001523118 -7.985844 1.154632e-14
boston_re_esf$e # error statistics
## stat
## resid_SE 0.2116227
## adjR2(cond) 0.7286322
## rlogLik -23.8780504
## AIC 61.7561009
## BIC 91.3418575
空間分布の可視化(定数項とNOX).
boston.c2 <- cbind(boston.c,
const = boston_re_esf$b_vc[, 1], nox_coef = boston_re_esf$b_vc[, 2],
const_p = boston_re_esf$p_vc[, 1], nox_p = boston_re_esf$p_vc[, 2])
ggplot(data = boston.c2, mapping = aes(x = LON, y = LAT)) +
geom_point(mapping = aes(colour = const)) +
coord_quickmap()
ggplot(data = boston.c2, mapping = aes(x = LON, y = LAT)) +
geom_point(mapping = aes(colour = nox_coef)) +
coord_quickmap()
ggplot(data = boston.c2, mapping = aes(x = LON, y = LAT)) +
geom_point(mapping = aes(colour = const_p)) + # 全地点0
coord_quickmap()
ggplot(data = boston.c2, mapping = aes(x = LON, y = LAT)) +
geom_point(mapping = aes(colour = nox_p)) + # 全地点0
coord_quickmap()
setwd(dir = "c://hoge")
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(curl)
MLIT・国土数値情報 http://nlftp.mlit.go.jp/ksj/gml/datalist/KsjTmplt-L01-v2_3.html からDLする.
lp28_path <- "http://nlftp.mlit.go.jp/ksj/gml/data/L01/L01-16/L01-16_13_GML.zip"
f <- tempfile()
curl::curl_download(url = lp28_path, destfile = f) # .zipでDL
unzip(zipfile = f, exdir = getwd())
unlink(f); rm(f)
これからはsfパッケージが主流になるとの説があるため,RでGISをやるときにはsfパッケージ、という世の中になるらしい。 や sfパッケージを用いたデータの読み込みから可視化まで を参照しながら試す.
library(sf)
## Linking to GEOS 3.5.0, GDAL 2.1.1
lp28 <- sf::st_read(dsn = getwd(), layer = "L01-16_13", quiet = T, stringsAsFactors = F)
lp28_sub <- lp28 %>%
mutate(lp = as.numeric(L01_006)) %>% # 地価 [円/m2]
filter(lp >= 1e5, lp <= 2e6) %>% # 10万円以上200万円以下
mutate(lnlp = log(lp)) %>% # 対数変換
mutate(chiseki = as.numeric(L01_020)) %>% # 地積 [m2]
filter(chiseki <= 3e2) %>% # 300m2以下のみ
mutate(station = as.numeric(L01_042)) %>% # 最寄駅までの距離 [m]
filter(station <= 2e3) %>% # 2km以下のみ
mutate(FAratio = as.numeric(L01_045)) # 容積率 [%]
# class(lp28_sub) # sfとdata.frameの両方の型が保持されている
nrow(lp28_sub) # sample size
## [1] 1855
ggplotで表示するために緯度経度をデータフレームに追加しておく(緯度経度情報の抽出はnozmaさんの「RとRcppで緯度経度から都道府県と行政区域を高速に検索(したかった)」を参照した).
lp28_sub <- cbind(t(matrix(unlist(lp28_sub$geometry), nrow = 2)), lp28_sub)
# 座標行列は matrix(unlist(lp28_sub$geometry), ncol = 2, byrow = T) でもよい
colnames(lp28_sub)[1:2] <- c("LON", "LAT")
対数地価の空間分布図.
# without ggplot (非推奨)
plot(lp28_sub, col = round(lp28_sub$lnlp), asp = 1, pch = 20, cex = .5)
# with ggplot (推奨)
ggplot(data = lp28_sub, mapping = aes(x = LON, y = LAT)) +
geom_point(mapping = aes(colour = lnlp)) +
scale_colour_gradientn(colours = terrain.colors(20)) +
coord_quickmap()
lp28_ols <- lm(formula = lnlp ~ chiseki + station + FAratio, data = lp28_sub)
summary(lp28_ols)
##
## Call:
## lm(formula = lnlp ~ chiseki + station + FAratio, data = lp28_sub)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.85191 -0.27729 0.00201 0.28888 1.31974
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.247e+01 4.491e-02 277.671 < 2e-16 ***
## chiseki 8.726e-04 1.922e-04 4.539 6.01e-06 ***
## station -3.753e-04 2.468e-05 -15.206 < 2e-16 ***
## FAratio 2.132e-03 7.329e-05 29.084 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4231 on 1851 degrees of freedom
## Multiple R-squared: 0.5313, Adjusted R-squared: 0.5306
## F-statistic: 699.5 on 3 and 1851 DF, p-value: < 2.2e-16
lp28_sub$ols_resid <- residuals(object = lp28_ols)
ggplot(data = lp28_sub, mapping = aes(x = LON, y = LAT)) +
geom_point(mapping = aes(colour = ols_resid)) +
scale_colour_gradient2(low = "blue", high = "red") + # 残差分布
coord_quickmap()
stオブジェクトの座標取得はMichalis Pavlis氏blog postを参考にしているが,対応する関数はないのだろうか…
lp28_x <- lp28_sub[, c("station", "chiseki", "FAratio")] # X
lp28_coords <- do.call(rbind, unclass(st_geometry(lp28_sub)))
system.time(lp28_meigen <- spmoran::meigen(coords = lp28_coords)) # 固有ベクトル
## 246/1855 eigen-pairs are extracted
## user system elapsed
## 13.13 0.19 14.05
system.time(lp28_esf <- spmoran::esf(y = lp28_sub$lnlp, x = lp28_x, meig = lp28_meigen))
## [1] 50
## [1] 100
## [1] 150
## 161/246 eigenvectors are selected
## user system elapsed
## 964.58 1.59 1026.38
lp28_esf$b # estimates
## Estimate SE t_value p_value
## (Intercept) 12.5778492509 1.840425e-02 683.42079 0.000000e+00
## station -0.0001988756 1.063950e-05 -18.69219 5.068386e-71
## chiseki 0.0003321443 7.506441e-05 4.42479 1.027033e-05
## FAratio 0.0015032302 3.662314e-05 41.04592 4.332570e-256
lp28_esf$e # error statistics
## stat
## resid_SE 0.1515581
## adjR2 0.9397560
## logLik 954.2593489
## AIC -1576.5186978
## BIC -659.2624619
残差の空間分布.まばらに分布しているように見える.
lp28_sub$esf_resid <- lp28_esf$resid
ggplot(data = lp28_sub, mapping = aes(x = LON, y = LAT)) +
geom_point(mapping = aes(colour = esf_resid)) +
scale_colour_gradient2(low = "blue", high = "red") + # 残差分布
coord_quickmap()
サンプルサイズ>2kで固有ベクトルの候補が360もあるため,手許の計算環境では spmoran::esf によるstepwise計算に1時間程度かかっている.
そこでパッケージで実装されている高速化(近似計算/変数選択しない)を試してみる.
system.time(lp28_meigen_f <- spmoran::meigen_f(coords = lp28_coords))
## 200/1855 eigen-pairs are approximated
## user system elapsed
## 0.12 0.00 0.14
system.time(lp28_esf_all <- spmoran::esf(y = lp28_sub$lnlp, x = lp28_x, meig = lp28_meigen_f, fn = "all"))
## 200/200 eigenvectors are selected
## user system elapsed
## 0.37 0.00 0.42
lp28_esf_all$b # estimates
## Estimate SE t_value p_value
## (Intercept) 12.5777507577 1.963905e-02 640.445983 0.000000e+00
## station -0.0002044275 1.122568e-05 -18.210699 1.140999e-67
## chiseki 0.0003129565 8.027630e-05 3.898492 1.006682e-04
## FAratio 0.0015328081 3.958263e-05 38.724260 6.082687e-234
lp28_esf_all$e # error statistics
## stat
## resid_SE 0.1598405
## adjR2 0.9329916
## logLik 877.2149517
## AIC -1344.4299034
## BIC -211.6737086
速い.
(以上)