国立環境研究所(追記:その後統数研に異動)の村上大輔さんが作成したパッケージ spmoran を,(1) exampleのBoston住宅価格データの分析をreproduce,(2) H28年度東京都公示地価データでreplicateする.

曰く「空間的自己相関を考慮した回帰係数の推定や、場所的に変化する回帰係数の推定を、高速に行うことのできるパッケージ」とのこと.

Setup

# install.packages("spmoran")
library(spmoran)
## Warning: package 'spmoran' was built under R version 3.3.3
# citation("spmoran")
library(ggplot2)

1. Boston住宅データ

library(spdep)
## Loading required package: sp
## Loading required package: Matrix
data(boston)
nrow(boston.c)  # sample size (cross-sectional data)
## [1] 506
head(boston.c, 2)
##         TOWN TOWNNO TRACT     LON     LAT MEDV CMEDV    CRIM ZN INDUS CHAS
## 1     Nahant      0  2011 -70.955 42.2550 24.0  24.0 0.00632 18  2.31    0
## 2 Swampscott      1  2021 -70.950 42.2875 21.6  21.6 0.02731  0  7.07    0
##     NOX    RM  AGE    DIS RAD TAX PTRATIO     B LSTAT
## 1 0.538 6.575 65.2 4.0900   1 296    15.3 396.9  4.98
## 2 0.469 6.421 78.9 4.9671   2 242    17.8 396.9  9.14

以降はexampleとvignetteを真似る.実際には緯度・経度を(距離計算のため)平面直角座標系に変換する必要があるが,ちょっと手を抜く.

boston_coords <- boston.c[, c("LON", "LAT")]
plot(boston_coords)  # 空間分布

ggplotの場合(価格の中央値 CMEDV でグラデーションを付ける). 「coord_quickmap()」はなくても描画自体はできるが,指定しないとアスペクト比が崩れる.

ggplot(data = boston.c, mapping = aes(x = LON, y = LAT)) +
  geom_point(mapping = aes(colour = CMEDV)) +
  coord_quickmap()

Moran’s eigenvectors

threshold引数を与えない場合(デフォルト設定),正の固有値に対応する固有ベクトルが抽出される.

boston_meigen <- spmoran::meigen(coords = boston_coords)  # extraction of Moran’s eigenvectors
##  55/506 eigen-pairs are extracted
plot(boston_meigen$ev)  # 固有値

固有ベクトルの空間分布描画のための下準備.第1・第2・第5・第10固有ベクトルの4つ.

boston_sf1_col <- cut(boston_meigen$sf[, 1], 10)  # 描画の色付けのために第一固有ベクトルを等幅で分割
boston_sf2_col <- cut(boston_meigen$sf[, 2], 10)  # 2nd
boston_sf5_col <- cut(boston_meigen$sf[, 5], 10)  # 5th
boston_sf10_col <- cut(boston_meigen$sf[, 10], 10)  # 10th
rb10 <- rainbow(n = 10, start = .7, end = .1)  # レインボーカラーの設定

描画.

par(mfrow = c(2, 2), mai = rep(0, 4))
# 第1固有ベクトル(左上)
plot(boston_coords, pch = 20, axes = F, col = rb10[boston_sf1_col])
legend(x = "topright", legend = levels(boston_sf1_col), pch = 20, cex = .5, col = rb10)
legend(x = "topleft", legend = "1st", bty = "n", pch = "")
# 第2固有ベクトル(右上)
plot(boston_coords, pch = 20, axes = F, col = rb10[boston_sf2_col])
legend(x = "topright", legend = levels(boston_sf2_col), pch = 20, cex = .5, col = rb10)
legend(x = "topleft", legend = "2nd", bty = "n", pch = "")
# 第5固有ベクトル(左下)
plot(boston_coords, pch = 20, axes = F, col = rb10[boston_sf5_col])
legend(x = "topright", legend = levels(boston_sf5_col), pch = 20, cex = .5, col = rb10)
legend(x = "topleft", legend = "5th", bty = "n", pch = "")
# 第10固有ベクトル(右下)
plot(boston_coords, pch = 20, axes = F, col = rb10[boston_sf10_col])
legend(x = "topright", legend = levels(boston_sf10_col), pch = 20, cex = .5, col = rb10)
legend(x = "topleft", legend = "10th", bty = "n", pch = "")

Linear model / 正規線形モデル

boston_ols <- lm(formula = log(CMEDV) ~ CRIM + NOX, data = boston.c)
summary(boston_ols)
## 
## Call:
## lm(formula = log(CMEDV) ~ CRIM + NOX, data = boston.c)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.08324 -0.18885 -0.05174  0.16442  1.10259 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.789578   0.074287  51.013   <2e-16 ***
## CRIM        -0.018106   0.001832  -9.882   <2e-16 ***
## NOX         -1.243193   0.136003  -9.141   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3212 on 503 degrees of freedom
## Multiple R-squared:  0.3833, Adjusted R-squared:  0.3809 
## F-statistic: 156.3 on 2 and 503 DF,  p-value: < 2.2e-16

残差プロット.系統的なパターンが確認できる.

boston.c$resid_ols <- residuals(boston_ols)
ggplot(data = boston.c, mapping = aes(x = LON, y = LAT)) +
  geom_point(mapping = aes(colour = resid_ols)) +
  coord_quickmap()

ESF / 固有ベクトル空間フィルタリング

boston_x <- boston.c[, c("CRIM", "NOX")]  # 説明変数行列
boston_esf <- spmoran::esf(y = log(boston.c$CMEDV), x = boston_x, meig = boston_meigen)
##   38/55 eigenvectors are selected
boston_esf$b  # estimates
##                Estimate          SE    t_value       p_value
## (Intercept)  4.47874555 0.117750586  38.035866 7.576872e-145
## CRIM        -0.01192241 0.001579564  -7.547909  2.353599e-13
## NOX         -2.52590252 0.211723195 -11.930212  8.253279e-29
head(boston_esf$vif)  # variance inflation factor
##           VIF
## CRIM 1.932361
## NOX  6.300820
## sf2  1.337738
## sf35 1.002090
## sf27 1.034274
## sf24 1.055055
boston_esf$e  # error statistics
##                 stat
## resid_SE   0.2196420
## adjR2      0.7105819
## logLik    70.3681474
## AIC      -56.7362947
## BIC      120.7782454

残差プロット.

boston.c$resid_esf <- boston_esf$resid
ggplot(data = boston.c, mapping = aes(x = LON, y = LAT)) +
  geom_point(mapping = aes(colour = resid_esf)) +
  coord_quickmap()

Random effects-ESF / 変量効果ESF

boston_esf <- spmoran::resf(y = log(boston.c$CMEDV), x = boston_x, meig = boston_meigen)
##  RE-ESF model fit by REML
boston_esf$b  # estimates
##                Estimate          SE    t_value      p_value
## (Intercept)  4.35122179 0.122169487  35.616273 0.000000e+00
## CRIM        -0.01216329 0.001523113  -7.985807 1.154632e-14
## NOX         -2.29443448 0.220311367 -10.414508 0.000000e+00
boston_esf$e  # error stat(s)
##                    stat
## resid_SE      0.2116215
## adjR2(cond)   0.7286355
## rlogLik     -23.8780504
## AIC          61.7561009
## BIC          91.3418575
boston_esf$s  # shrinkage param(s)
##                       par
## shrink_sf_SE    0.3299662
## shrink_sf_alpha 0.6026645

Random effects-ESF with SVCs / RE-ESFに基づく空間可変パラメータモデル

boston_x_vary <- boston.c$NOX  # X with spatially varying coefficients
boston_x_const <- boston.c$CRIM  # X with constant coefficients
boston_re_esf <- resf_vc(y = log(boston.c$CMEDV), x = boston_x_vary, xconst = boston_x_const, meig = boston_meigen)
##  RE-ESF model fit by REML
head(boston_re_esf$b_vc)  # (const + NOX) estimates
##   (Intercept)        V1
## 1    4.276740 -2.294435
## 2    4.211276 -2.294435
## 3    4.379810 -2.294435
## 4    4.428908 -2.294435
## 5    4.424535 -2.294435
## 6    4.382977 -2.294435
summary(boston_re_esf$b_vc)
##   (Intercept)          V1        
##  Min.   :3.928   Min.   :-2.294  
##  1st Qu.:4.167   1st Qu.:-2.294  
##  Median :4.313   Median :-2.294  
##  Mean   :4.351   Mean   :-2.294  
##  3rd Qu.:4.496   3rd Qu.:-2.294  
##  Max.   :5.094   Max.   :-2.294
summary(boston_re_esf$p_vc)
##   (Intercept)       V1   
##  Min.   :0    Min.   :0  
##  1st Qu.:0    1st Qu.:0  
##  Median :0    Median :0  
##  Mean   :0    Mean   :0  
##  3rd Qu.:0    3rd Qu.:0  
##  Max.   :0    Max.   :0
boston_re_esf$b  # CRIM estimates
##       Estimate          SE   t_value      p_value
## V1 -0.01216338 0.001523118 -7.985844 1.154632e-14
boston_re_esf$e  # error statistics
##                    stat
## resid_SE      0.2116227
## adjR2(cond)   0.7286322
## rlogLik     -23.8780504
## AIC          61.7561009
## BIC          91.3418575

空間分布の可視化(定数項とNOX).

boston.c2 <- cbind(boston.c,
                   const = boston_re_esf$b_vc[, 1], nox_coef = boston_re_esf$b_vc[, 2],
                   const_p = boston_re_esf$p_vc[, 1], nox_p = boston_re_esf$p_vc[, 2])
ggplot(data = boston.c2, mapping = aes(x = LON, y = LAT)) +
  geom_point(mapping = aes(colour = const)) +
  coord_quickmap()

ggplot(data = boston.c2, mapping = aes(x = LON, y = LAT)) +
  geom_point(mapping = aes(colour = nox_coef)) +
  coord_quickmap()

ggplot(data = boston.c2, mapping = aes(x = LON, y = LAT)) +
  geom_point(mapping = aes(colour = const_p)) +  # 全地点0
  coord_quickmap()

ggplot(data = boston.c2, mapping = aes(x = LON, y = LAT)) +
  geom_point(mapping = aes(colour = nox_p)) +  # 全地点0
  coord_quickmap()

2. H28年度東京都公示地価

setwd(dir = "c://hoge")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(curl)

Download from NLNI

MLIT・国土数値情報 http://nlftp.mlit.go.jp/ksj/gml/datalist/KsjTmplt-L01-v2_3.html からDLする.

lp28_path <- "http://nlftp.mlit.go.jp/ksj/gml/data/L01/L01-16/L01-16_13_GML.zip"
f <- tempfile()
curl::curl_download(url = lp28_path, destfile = f)  # .zipでDL
unzip(zipfile = f, exdir = getwd())
unlink(f); rm(f)

これからはsfパッケージが主流になるとの説があるため,RでGISをやるときにはsfパッケージ、という世の中になるらしい。sfパッケージを用いたデータの読み込みから可視化まで を参照しながら試す.

library(sf)
## Linking to GEOS 3.5.0, GDAL 2.1.1
lp28 <- sf::st_read(dsn = getwd(), layer = "L01-16_13", quiet = T, stringsAsFactors = F)

lp28_sub <- lp28 %>%
  mutate(lp = as.numeric(L01_006)) %>%  # 地価 [円/m2]
  filter(lp >= 1e5, lp <= 2e6) %>%  # 10万円以上200万円以下
  mutate(lnlp = log(lp)) %>%  # 対数変換
  mutate(chiseki = as.numeric(L01_020)) %>%  # 地積 [m2]
  filter(chiseki <= 3e2) %>%  # 300m2以下のみ
  mutate(station = as.numeric(L01_042)) %>%  # 最寄駅までの距離 [m]
  filter(station <= 2e3) %>%  # 2km以下のみ
  mutate(FAratio = as.numeric(L01_045))  # 容積率 [%]
# class(lp28_sub)  # sfとdata.frameの両方の型が保持されている
nrow(lp28_sub)  # sample size
## [1] 1855

ggplotで表示するために緯度経度をデータフレームに追加しておく(緯度経度情報の抽出はnozmaさんの「RとRcppで緯度経度から都道府県と行政区域を高速に検索(したかった)」を参照した).

lp28_sub <- cbind(t(matrix(unlist(lp28_sub$geometry), nrow = 2)), lp28_sub)
 # 座標行列は matrix(unlist(lp28_sub$geometry), ncol = 2, byrow = T) でもよい
colnames(lp28_sub)[1:2] <- c("LON", "LAT")

対数地価の空間分布図.

# without ggplot (非推奨)
plot(lp28_sub, col = round(lp28_sub$lnlp), asp = 1, pch = 20, cex = .5)

# with ggplot (推奨)
ggplot(data = lp28_sub, mapping = aes(x = LON, y = LAT)) +
  geom_point(mapping = aes(colour = lnlp)) +
  scale_colour_gradientn(colours = terrain.colors(20)) +
  coord_quickmap()

LM as baseline

lp28_ols <- lm(formula = lnlp ~ chiseki + station + FAratio, data = lp28_sub)
summary(lp28_ols)
## 
## Call:
## lm(formula = lnlp ~ chiseki + station + FAratio, data = lp28_sub)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.85191 -0.27729  0.00201  0.28888  1.31974 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.247e+01  4.491e-02 277.671  < 2e-16 ***
## chiseki      8.726e-04  1.922e-04   4.539 6.01e-06 ***
## station     -3.753e-04  2.468e-05 -15.206  < 2e-16 ***
## FAratio      2.132e-03  7.329e-05  29.084  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4231 on 1851 degrees of freedom
## Multiple R-squared:  0.5313, Adjusted R-squared:  0.5306 
## F-statistic: 699.5 on 3 and 1851 DF,  p-value: < 2.2e-16
lp28_sub$ols_resid <- residuals(object = lp28_ols)
ggplot(data = lp28_sub, mapping = aes(x = LON, y = LAT)) +
  geom_point(mapping = aes(colour = ols_resid)) +
  scale_colour_gradient2(low = "blue", high = "red") +  # 残差分布
  coord_quickmap()

ESF

stオブジェクトの座標取得はMichalis Pavlis氏blog postを参考にしているが,対応する関数はないのだろうか…

lp28_x <- lp28_sub[, c("station", "chiseki", "FAratio")]  # X
lp28_coords <- do.call(rbind, unclass(st_geometry(lp28_sub)))
system.time(lp28_meigen <- spmoran::meigen(coords = lp28_coords))  # 固有ベクトル
##  246/1855 eigen-pairs are extracted
##    user  system elapsed 
##   13.13    0.19   14.05
system.time(lp28_esf <- spmoran::esf(y = lp28_sub$lnlp, x = lp28_x, meig = lp28_meigen))
## [1] 50
## [1] 100
## [1] 150
##   161/246 eigenvectors are selected
##    user  system elapsed 
##  964.58    1.59 1026.38
lp28_esf$b  # estimates
##                  Estimate           SE   t_value       p_value
## (Intercept) 12.5778492509 1.840425e-02 683.42079  0.000000e+00
## station     -0.0001988756 1.063950e-05 -18.69219  5.068386e-71
## chiseki      0.0003321443 7.506441e-05   4.42479  1.027033e-05
## FAratio      0.0015032302 3.662314e-05  41.04592 4.332570e-256
lp28_esf$e  # error statistics
##                   stat
## resid_SE     0.1515581
## adjR2        0.9397560
## logLik     954.2593489
## AIC      -1576.5186978
## BIC       -659.2624619

残差の空間分布.まばらに分布しているように見える.

lp28_sub$esf_resid <- lp28_esf$resid
ggplot(data = lp28_sub, mapping = aes(x = LON, y = LAT)) +
  geom_point(mapping = aes(colour = esf_resid)) +
  scale_colour_gradient2(low = "blue", high = "red") +  # 残差分布
  coord_quickmap()

サンプルサイズ>2kで固有ベクトルの候補が360もあるため,手許の計算環境では spmoran::esf によるstepwise計算に1時間程度かかっている.

そこでパッケージで実装されている高速化(近似計算/変数選択しない)を試してみる.

system.time(lp28_meigen_f <- spmoran::meigen_f(coords = lp28_coords))
##  200/1855 eigen-pairs are approximated
##    user  system elapsed 
##    0.12    0.00    0.14
system.time(lp28_esf_all <- spmoran::esf(y = lp28_sub$lnlp, x = lp28_x, meig = lp28_meigen_f, fn = "all"))
##   200/200 eigenvectors are selected
##    user  system elapsed 
##    0.37    0.00    0.42
lp28_esf_all$b  # estimates
##                  Estimate           SE    t_value       p_value
## (Intercept) 12.5777507577 1.963905e-02 640.445983  0.000000e+00
## station     -0.0002044275 1.122568e-05 -18.210699  1.140999e-67
## chiseki      0.0003129565 8.027630e-05   3.898492  1.006682e-04
## FAratio      0.0015328081 3.958263e-05  38.724260 6.082687e-234
lp28_esf_all$e  # error statistics
##                   stat
## resid_SE     0.1598405
## adjR2        0.9329916
## logLik     877.2149517
## AIC      -1344.4299034
## BIC       -211.6737086

速い.

(以上)