このページでは、株式会社インサイト・ファクトリー 2020年度社内研修「マーケティング・ミックス・モデリング」III章「回帰分析の基礎」のRコードを公開しています。

準備

library(tidyverse)
## -- Attaching packages --------------------------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.0     √ purrr   0.3.3
## √ tibble  3.0.0     √ dplyr   0.8.5
## √ tidyr   1.0.2     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.5.0
## -- Conflicts ------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(assertr)
library(patchwork)
library(rgl)
windowsFonts(MeiryoUI = "Meiryo UI")

### サンプルデータの読込
# 読込
dfMansion <- read_csv("./Mansion.csv")
## Parsed with column specification:
## cols(
##   No = col_double(),
##   Hirosa = col_double(),
##   Nensu = col_double(),
##   Price = col_double()
## )

Code 1. 中古マンションの価格

# 散布図
g1 <- ggplot(data = dfMansion, aes(x = Hirosa, y = Price)) +
      labs(x = "広さ(平方m)", y = "価格(百万円)")
g2 <- ggplot(data = dfMansion, aes(x = Nensu, y = Price)) + 
      labs(x = "築年数", y = "価格(百万円)")
g3 <- ggplot(data = dfMansion, aes(x = Hirosa, y = Nensu)) + 
      labs(x = "広さ", y = "築年数")
# patchworkパッケージによってチャートを結合
g <- (g1 + g2 + g3) 
# patchworkパッケージでは、結合したチャートを一括して修正できる
g <- g & geom_point(size = 4) 
g <- g & theme_bw(base_family = "MeiryoUI")
print(g)

# 平均
summary(dfMansion)
##        No            Hirosa          Nensu           Price      
##  Min.   : 1.00   Min.   :38.00   Min.   : 1.00   Min.   :30.00  
##  1st Qu.: 3.25   1st Qu.:51.50   1st Qu.: 4.00   1st Qu.:34.50  
##  Median : 5.50   Median :60.00   Median : 5.00   Median :44.50  
##  Mean   : 5.50   Mean   :60.40   Mean   : 8.60   Mean   :43.60  
##  3rd Qu.: 7.75   3rd Qu.:71.25   3rd Qu.:14.75   3rd Qu.:51.75  
##  Max.   :10.00   Max.   :77.00   Max.   :22.00   Max.   :60.00
# 相関
cor(dfMansion %>% dplyr::select(Price, Hirosa, Nensu))
##             Price    Hirosa      Nensu
## Price   1.0000000 0.7917410 -0.5375242
## Hirosa  0.7917410 1.0000000  0.0367834
## Nensu  -0.5375242 0.0367834  1.0000000

Code 2. 最小二乗法のシミュレーション(2事例)

# デモンストレーション用の仮のパラメータを定義
lParam <- list(
  cand1 = list(a = 8,  b = 0.55),
  cand2 = list(a = -2, b = 0.8)
)
# 散布図に仮の直線を載せる
g <- ggplot(data = dfMansion, aes(x = Hirosa, y = Price))
g <- g + geom_point(size = 4)
g <- g + geom_abline(intercept = lParam$cand1$a, slope = lParam$cand1$b)
g <- g + geom_abline(intercept = lParam$cand2$a, slope = lParam$cand2$b)
g <- g + labs(x = "広さ(平方m)", y = "価格(百万円)")
g <- g + coord_cartesian(xlim = c(0, 100), ylim = c(0, 100))
g <- g + theme_bw(base_family = "MeiryoUI")
print(g)

# 残差二乗和の計算例
out <- lParam %>%
  # purrr::map_dfr() は、リストを受け取り、リストの各要素にたいして
  # 関数 .f を適用し、その結果得られるデータフレームに要素名を持つ
  # 列(列名は.id)を追加し、縦に積んで返す
  map_dfr(
    .f = function(lCurrent){
      dfMansion %>%
        dplyr::select(No, Hirosa, Price) %>%
        mutate(gPred = lCurrent$a + Hirosa * lCurrent$b)
    }, 
    .id = "sID"
  ) %>%
  # 残差二乗を求める
  mutate(gSQError = (Price - gPred)^2) %>%
  # 予測値と残差二乗をlongに
  gather(varname, value, c(gPred, gSQError)) %>%
  # 変数名を作成する
  mutate(
    varname = paste0(varname, sub("^cand", "", sID))
  ) %>%
  # wideに
  dplyr::select(-sID) %>%
  spread(varname, value)
# 出力
print(out)
## # A tibble: 10 x 7
##       No Hirosa Price gPred1 gPred2 gSQError1 gSQError2
##    <dbl>  <dbl> <dbl>  <dbl>  <dbl>     <dbl>     <dbl>
##  1     1     51    30   36.0   38.8     36.6    77.4   
##  2     2     38    32   28.9   28.4      9.61   13.0   
##  3     3     57    33   39.4   43.6     40.3   112.    
##  4     4     51    39   36.0   38.8      8.70    0.0400
##  5     5     53    44   37.2   40.4     46.9    13.0   
##  6     6     77    45   50.4   59.6     28.6   213.    
##  7     7     63    45   42.7   48.4      5.52   11.6   
##  8     8     69    54   46.0   53.2     64.8     0.640 
##  9     9     72    54   47.6   55.6     41.0     2.56  
## 10    10     73    60   48.2   56.4    140.     13.0
write.csv(out, "./chap3_code2.csv", row.names = FALSE)

Code 3. 最小二乗法のシミュレーション(グリッドサーチ)

# sub_SQError(): パラメータをもらって残差平方和を返す関数
sub_SQError <- function(gA,gB){
  agPred <- gA + gB * dfMansion$Hirosa
  sum( (dfMansion$Price - agPred)^2 )
}
# sub_SQErrors(): パラメータのベクトルをもらって
# 要素ごとにsub_SQError()をコールする関数
sub_SQErrors <- function(agA, agB){
  stopifnot(length(agA) == length(agB))
  sapply(seq_along(agA), function(nID) sub_SQError(agA[nID], agB[nID]))
}
# 描画
agX <- seq(-2, 10,  length = 50)
agY <- seq(0.4, 1,  length = 50)
mgZ <- outer(agX, agY, sub_SQErrors)
persp(
  agX, agY, mgZ,
  ticktype = "detailed", expand = 0.5, theta = -20, phi = 45,
  xlab = "a", ylab = "b", zlab = ""
)

Code 4. 回帰分析

# 関数を使わずに
x <- dfMansion$Hirosa
y <- dfMansion$Price
b <- sum( (x - mean(x))*(y - mean(y))) / sum( (x - mean(x))^2 )
a <- mean(y) - b * mean(x)
print(a)
## [1] 4.286288
print(b)
## [1] 0.6508893
# 関数を使って
summary(lm(Price ~ Hirosa, data = dfMansion))
## 
## Call:
## lm(formula = Price ~ Hirosa, data = dfMansion)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.405 -5.684  2.184  4.347  8.199 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   4.2863    10.9270   0.392  0.70511   
## Hirosa        0.6509     0.1775   3.666  0.00635 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.63 on 8 degrees of freedom
## Multiple R-squared:  0.6269, Adjusted R-squared:  0.5802 
## F-statistic: 13.44 on 1 and 8 DF,  p-value: 0.006346

Code 5. 散布図(広さ vs 家賃)

g <- ggplot(data = dfMansion, aes(x = Hirosa, y = Price))
g <- g + geom_point(size = 4)
g <- g + geom_smooth(method = "lm", color = "red", se = FALSE)
g <- g + labs(x = "広さ(平方m)", y = "価格(百万円)")
g <- g + theme_bw(base_family = "MeiryoUI")
print(g)
## `geom_smooth()` using formula 'y ~ x'

Code 6. 正規密度の計算例

1/(sqrt(2*pi) * 2) * exp(-(12 - 10)^2 / (2*4))
## [1] 0.1209854
dnorm(12, 10, 2)
## [1] 0.1209854
dnorm(12, 10, 2) * dnorm(8, 10, 2) * dnorm(9, 10, 2)
## [1] 0.002576671

Code 7. 最尤法のシミュレーション (2事例)

# map_dfr()についてはCode 2を参照
out <- lParam %>%
  map_dfr(
    .f = function(lCurrent){
      dfMansion %>%
        dplyr::select(No, Hirosa, Price) %>%
        mutate(gPred = lCurrent$a + Hirosa * lCurrent$b)
    }, 
    .id = "sID"
  ) %>%
  mutate( gL = dnorm(Price, gPred, 5)) %>%
  gather(varname, value, c(gPred, gL)) %>%
  mutate(varname = paste0(varname, sub("^cand", "", sID))) %>%
  dplyr::select(-sID) %>%
  spread(varname, value)
print(out)
## # A tibble: 10 x 7
##       No Hirosa Price     gL1     gL2 gPred1 gPred2
##    <dbl>  <dbl> <dbl>   <dbl>   <dbl>  <dbl>  <dbl>
##  1     1     51    30 0.0384  0.0170    36.0   38.8
##  2     2     38    32 0.0658  0.0616    28.9   28.4
##  3     3     57    33 0.0356  0.00843   39.4   43.6
##  4     4     51    39 0.0670  0.0797    36.0   38.8
##  5     5     53    44 0.0312  0.0616    37.2   40.4
##  6     6     77    45 0.0450  0.00112   50.4   59.6
##  7     7     63    45 0.0714  0.0633    42.7   48.4
##  8     8     69    54 0.0218  0.0788    46.0   53.2
##  9     9     72    54 0.0352  0.0758    47.6   55.6
## 10    10     73    60 0.00481 0.0616    48.2   56.4
write.csv(out, "./chap3_code7.csv", row.names = FALSE)

Code 8. 最尤法のシミュレーション(グリッドサーチ)

# sub_Likelihood(): パラメータをもらって尤度を返す関数
sub_Likelihood <- function(gA,gB){
  agPred <- gA + gB * dfMansion$Hirosa
  exp(sum(log(dnorm(dfMansion$Price, agPred, 5))))
}
# sub_Likelihoods(): パラメータのベクトルをもらって
# 要素ごとにsub_Likelihood()をコールする関数
sub_Likelihoods <- function(agA, agB){
  stopifnot(length(agA) == length(agB))
  sapply(seq_along(agA), function(nID) sub_Likelihood(agA[nID], agB[nID]))
}
# 描画
agX <- seq(-2, 10,  length = 50)
agY <- seq(0.4, 1,  length = 50)
mgZ <- outer(agX, agY, sub_Likelihoods)
persp(
  agX, agY, mgZ,
  ticktype = "detailed", expand = 0.5, theta = -20, phi = 45,
  xlab = "a", ylab = "b", zlab = ""
)

Code 9. 平方和の分解

agPredict <- predict(lm(Price ~ Hirosa, data = dfMansion))
a <- sum((dfMansion$Price - mean(dfMansion$Price))^2)
b <- sum((agPredict - mean(dfMansion$Price))^2)
c <- sum((dfMansion$Price - agPredict)^2)
print(a)
## [1] 942.4
print(b)
## [1] 590.7471
print(c)
## [1] 351.6529
print(b/a)
## [1] 0.6268539

Code 10. 重回帰のパラメータ推定

# 関数を使わずに
X <- dfMansion %>%
  mutate(Intercept = 1) %>%
  dplyr::select(Intercept, Hirosa, Nensu) %>%
  as.matrix(.)
Y <- dfMansion %>%
  dplyr::select(Price) %>%
  as.matrix(.)
betahat <- solve(t(X) %*% X) %*% t(X) %*% Y
print(betahat)
##                Price
## Intercept 10.2012955
## Hirosa     0.6680477
## Nensu     -0.8082993
# 関数を使って
oModel <- lm(Price ~ Hirosa + Nensu, data = dfMansion)
agCoef <- coef(oModel)
summary(oModel)
## 
## Call:
## lm(formula = Price ~ Hirosa + Nensu, data = dfMansion)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.2468 -2.0952  0.3939  1.7150  3.6196 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.20130    4.43624   2.300 0.055029 .  
## Hirosa       0.66805    0.07065   9.456 3.09e-05 ***
## Nensu       -0.80830    0.12241  -6.603 0.000303 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.636 on 7 degrees of freedom
## Multiple R-squared:  0.9484, Adjusted R-squared:  0.9336 
## F-statistic:  64.3 on 2 and 7 DF,  p-value: 3.126e-05
# チャート
plot3d(
  dfMansion[c("Hirosa", "Nensu", "Price")], 
  size = 5, box = FALSE, axis = FALSE, type = "p"
)
plot3d(
  dfMansion[c("Hirosa", "Nensu", "Price")], 
  size = 5, box = FALSE, axis = FALSE, type = "h", add = TRUE
)
planes3d(agCoef[2], agCoef[3], -1, agCoef[1], col = "blue", alpha = 0.2)

Code 11. 最大対数尤度とAIC

# 関数を使わずに
X <- dfMansion %>%
  mutate(Intercept = 1) %>%
  dplyr::select(Intercept, Hirosa, Nensu) %>%
  as.matrix(.)
Y <- dfMansion %>%
  dplyr::select(Price) %>%
  as.matrix(.)
betahat <- solve(t(X) %*% X) %*% t(X) %*% Y
ehat <- Y - X %*% betahat
sshat <- as.vector((1/length(Y)) * t(ehat) %*% ehat)
LL <- -(length(Y)/2) * (log(sshat) + 1 + log(2 * pi))
AIC <- -2 * LL + 2 * 4 
print(LL)
## [1] -22.09959
print(AIC)
## [1] 52.19917
# 関数を使って
oModel <- lm(Price ~ Hirosa + Nensu, data = dfMansion)
print(logLik(oModel))
## 'log Lik.' -22.09959 (df=4)
print(AIC(oModel))
## [1] 52.19917

以上