このページでは、株式会社インサイト・ファクトリー 2020年度社内研修「マーケティング・ミックス・モデリング」III章「回帰分析の基礎」のRコードを公開しています。
library(tidyverse)
## -- Attaching packages --------------------------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.0 √ purrr 0.3.3
## √ tibble 3.0.0 √ dplyr 0.8.5
## √ tidyr 1.0.2 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.5.0
## -- Conflicts ------------------------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(assertr)
library(patchwork)
library(rgl)
windowsFonts(MeiryoUI = "Meiryo UI")
### サンプルデータの読込
# 読込
dfMansion <- read_csv("./Mansion.csv")
## Parsed with column specification:
## cols(
## No = col_double(),
## Hirosa = col_double(),
## Nensu = col_double(),
## Price = col_double()
## )
# 散布図
g1 <- ggplot(data = dfMansion, aes(x = Hirosa, y = Price)) +
labs(x = "広さ(平方m)", y = "価格(百万円)")
g2 <- ggplot(data = dfMansion, aes(x = Nensu, y = Price)) +
labs(x = "築年数", y = "価格(百万円)")
g3 <- ggplot(data = dfMansion, aes(x = Hirosa, y = Nensu)) +
labs(x = "広さ", y = "築年数")
# patchworkパッケージによってチャートを結合
g <- (g1 + g2 + g3)
# patchworkパッケージでは、結合したチャートを一括して修正できる
g <- g & geom_point(size = 4)
g <- g & theme_bw(base_family = "MeiryoUI")
print(g)
# 平均
summary(dfMansion)
## No Hirosa Nensu Price
## Min. : 1.00 Min. :38.00 Min. : 1.00 Min. :30.00
## 1st Qu.: 3.25 1st Qu.:51.50 1st Qu.: 4.00 1st Qu.:34.50
## Median : 5.50 Median :60.00 Median : 5.00 Median :44.50
## Mean : 5.50 Mean :60.40 Mean : 8.60 Mean :43.60
## 3rd Qu.: 7.75 3rd Qu.:71.25 3rd Qu.:14.75 3rd Qu.:51.75
## Max. :10.00 Max. :77.00 Max. :22.00 Max. :60.00
# 相関
cor(dfMansion %>% dplyr::select(Price, Hirosa, Nensu))
## Price Hirosa Nensu
## Price 1.0000000 0.7917410 -0.5375242
## Hirosa 0.7917410 1.0000000 0.0367834
## Nensu -0.5375242 0.0367834 1.0000000
# デモンストレーション用の仮のパラメータを定義
lParam <- list(
cand1 = list(a = 8, b = 0.55),
cand2 = list(a = -2, b = 0.8)
)
# 散布図に仮の直線を載せる
g <- ggplot(data = dfMansion, aes(x = Hirosa, y = Price))
g <- g + geom_point(size = 4)
g <- g + geom_abline(intercept = lParam$cand1$a, slope = lParam$cand1$b)
g <- g + geom_abline(intercept = lParam$cand2$a, slope = lParam$cand2$b)
g <- g + labs(x = "広さ(平方m)", y = "価格(百万円)")
g <- g + coord_cartesian(xlim = c(0, 100), ylim = c(0, 100))
g <- g + theme_bw(base_family = "MeiryoUI")
print(g)
# 残差二乗和の計算例
out <- lParam %>%
# purrr::map_dfr() は、リストを受け取り、リストの各要素にたいして
# 関数 .f を適用し、その結果得られるデータフレームに要素名を持つ
# 列(列名は.id)を追加し、縦に積んで返す
map_dfr(
.f = function(lCurrent){
dfMansion %>%
dplyr::select(No, Hirosa, Price) %>%
mutate(gPred = lCurrent$a + Hirosa * lCurrent$b)
},
.id = "sID"
) %>%
# 残差二乗を求める
mutate(gSQError = (Price - gPred)^2) %>%
# 予測値と残差二乗をlongに
gather(varname, value, c(gPred, gSQError)) %>%
# 変数名を作成する
mutate(
varname = paste0(varname, sub("^cand", "", sID))
) %>%
# wideに
dplyr::select(-sID) %>%
spread(varname, value)
# 出力
print(out)
## # A tibble: 10 x 7
## No Hirosa Price gPred1 gPred2 gSQError1 gSQError2
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 51 30 36.0 38.8 36.6 77.4
## 2 2 38 32 28.9 28.4 9.61 13.0
## 3 3 57 33 39.4 43.6 40.3 112.
## 4 4 51 39 36.0 38.8 8.70 0.0400
## 5 5 53 44 37.2 40.4 46.9 13.0
## 6 6 77 45 50.4 59.6 28.6 213.
## 7 7 63 45 42.7 48.4 5.52 11.6
## 8 8 69 54 46.0 53.2 64.8 0.640
## 9 9 72 54 47.6 55.6 41.0 2.56
## 10 10 73 60 48.2 56.4 140. 13.0
write.csv(out, "./chap3_code2.csv", row.names = FALSE)
# sub_SQError(): パラメータをもらって残差平方和を返す関数
sub_SQError <- function(gA,gB){
agPred <- gA + gB * dfMansion$Hirosa
sum( (dfMansion$Price - agPred)^2 )
}
# sub_SQErrors(): パラメータのベクトルをもらって
# 要素ごとにsub_SQError()をコールする関数
sub_SQErrors <- function(agA, agB){
stopifnot(length(agA) == length(agB))
sapply(seq_along(agA), function(nID) sub_SQError(agA[nID], agB[nID]))
}
# 描画
agX <- seq(-2, 10, length = 50)
agY <- seq(0.4, 1, length = 50)
mgZ <- outer(agX, agY, sub_SQErrors)
persp(
agX, agY, mgZ,
ticktype = "detailed", expand = 0.5, theta = -20, phi = 45,
xlab = "a", ylab = "b", zlab = ""
)
# 関数を使わずに
x <- dfMansion$Hirosa
y <- dfMansion$Price
b <- sum( (x - mean(x))*(y - mean(y))) / sum( (x - mean(x))^2 )
a <- mean(y) - b * mean(x)
print(a)
## [1] 4.286288
print(b)
## [1] 0.6508893
# 関数を使って
summary(lm(Price ~ Hirosa, data = dfMansion))
##
## Call:
## lm(formula = Price ~ Hirosa, data = dfMansion)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.405 -5.684 2.184 4.347 8.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.2863 10.9270 0.392 0.70511
## Hirosa 0.6509 0.1775 3.666 0.00635 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.63 on 8 degrees of freedom
## Multiple R-squared: 0.6269, Adjusted R-squared: 0.5802
## F-statistic: 13.44 on 1 and 8 DF, p-value: 0.006346
g <- ggplot(data = dfMansion, aes(x = Hirosa, y = Price))
g <- g + geom_point(size = 4)
g <- g + geom_smooth(method = "lm", color = "red", se = FALSE)
g <- g + labs(x = "広さ(平方m)", y = "価格(百万円)")
g <- g + theme_bw(base_family = "MeiryoUI")
print(g)
## `geom_smooth()` using formula 'y ~ x'
1/(sqrt(2*pi) * 2) * exp(-(12 - 10)^2 / (2*4))
## [1] 0.1209854
dnorm(12, 10, 2)
## [1] 0.1209854
dnorm(12, 10, 2) * dnorm(8, 10, 2) * dnorm(9, 10, 2)
## [1] 0.002576671
# map_dfr()についてはCode 2を参照
out <- lParam %>%
map_dfr(
.f = function(lCurrent){
dfMansion %>%
dplyr::select(No, Hirosa, Price) %>%
mutate(gPred = lCurrent$a + Hirosa * lCurrent$b)
},
.id = "sID"
) %>%
mutate( gL = dnorm(Price, gPred, 5)) %>%
gather(varname, value, c(gPred, gL)) %>%
mutate(varname = paste0(varname, sub("^cand", "", sID))) %>%
dplyr::select(-sID) %>%
spread(varname, value)
print(out)
## # A tibble: 10 x 7
## No Hirosa Price gL1 gL2 gPred1 gPred2
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1 51 30 0.0384 0.0170 36.0 38.8
## 2 2 38 32 0.0658 0.0616 28.9 28.4
## 3 3 57 33 0.0356 0.00843 39.4 43.6
## 4 4 51 39 0.0670 0.0797 36.0 38.8
## 5 5 53 44 0.0312 0.0616 37.2 40.4
## 6 6 77 45 0.0450 0.00112 50.4 59.6
## 7 7 63 45 0.0714 0.0633 42.7 48.4
## 8 8 69 54 0.0218 0.0788 46.0 53.2
## 9 9 72 54 0.0352 0.0758 47.6 55.6
## 10 10 73 60 0.00481 0.0616 48.2 56.4
write.csv(out, "./chap3_code7.csv", row.names = FALSE)
# sub_Likelihood(): パラメータをもらって尤度を返す関数
sub_Likelihood <- function(gA,gB){
agPred <- gA + gB * dfMansion$Hirosa
exp(sum(log(dnorm(dfMansion$Price, agPred, 5))))
}
# sub_Likelihoods(): パラメータのベクトルをもらって
# 要素ごとにsub_Likelihood()をコールする関数
sub_Likelihoods <- function(agA, agB){
stopifnot(length(agA) == length(agB))
sapply(seq_along(agA), function(nID) sub_Likelihood(agA[nID], agB[nID]))
}
# 描画
agX <- seq(-2, 10, length = 50)
agY <- seq(0.4, 1, length = 50)
mgZ <- outer(agX, agY, sub_Likelihoods)
persp(
agX, agY, mgZ,
ticktype = "detailed", expand = 0.5, theta = -20, phi = 45,
xlab = "a", ylab = "b", zlab = ""
)
agPredict <- predict(lm(Price ~ Hirosa, data = dfMansion))
a <- sum((dfMansion$Price - mean(dfMansion$Price))^2)
b <- sum((agPredict - mean(dfMansion$Price))^2)
c <- sum((dfMansion$Price - agPredict)^2)
print(a)
## [1] 942.4
print(b)
## [1] 590.7471
print(c)
## [1] 351.6529
print(b/a)
## [1] 0.6268539
# 関数を使わずに
X <- dfMansion %>%
mutate(Intercept = 1) %>%
dplyr::select(Intercept, Hirosa, Nensu) %>%
as.matrix(.)
Y <- dfMansion %>%
dplyr::select(Price) %>%
as.matrix(.)
betahat <- solve(t(X) %*% X) %*% t(X) %*% Y
print(betahat)
## Price
## Intercept 10.2012955
## Hirosa 0.6680477
## Nensu -0.8082993
# 関数を使って
oModel <- lm(Price ~ Hirosa + Nensu, data = dfMansion)
agCoef <- coef(oModel)
summary(oModel)
##
## Call:
## lm(formula = Price ~ Hirosa + Nensu, data = dfMansion)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.2468 -2.0952 0.3939 1.7150 3.6196
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.20130 4.43624 2.300 0.055029 .
## Hirosa 0.66805 0.07065 9.456 3.09e-05 ***
## Nensu -0.80830 0.12241 -6.603 0.000303 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.636 on 7 degrees of freedom
## Multiple R-squared: 0.9484, Adjusted R-squared: 0.9336
## F-statistic: 64.3 on 2 and 7 DF, p-value: 3.126e-05
# チャート
plot3d(
dfMansion[c("Hirosa", "Nensu", "Price")],
size = 5, box = FALSE, axis = FALSE, type = "p"
)
plot3d(
dfMansion[c("Hirosa", "Nensu", "Price")],
size = 5, box = FALSE, axis = FALSE, type = "h", add = TRUE
)
planes3d(agCoef[2], agCoef[3], -1, agCoef[1], col = "blue", alpha = 0.2)
# 関数を使わずに
X <- dfMansion %>%
mutate(Intercept = 1) %>%
dplyr::select(Intercept, Hirosa, Nensu) %>%
as.matrix(.)
Y <- dfMansion %>%
dplyr::select(Price) %>%
as.matrix(.)
betahat <- solve(t(X) %*% X) %*% t(X) %*% Y
ehat <- Y - X %*% betahat
sshat <- as.vector((1/length(Y)) * t(ehat) %*% ehat)
LL <- -(length(Y)/2) * (log(sshat) + 1 + log(2 * pi))
AIC <- -2 * LL + 2 * 4
print(LL)
## [1] -22.09959
print(AIC)
## [1] 52.19917
# 関数を使って
oModel <- lm(Price ~ Hirosa + Nensu, data = dfMansion)
print(logLik(oModel))
## 'log Lik.' -22.09959 (df=4)
print(AIC(oModel))
## [1] 52.19917
以上