Abstract
みんなのR 19章の勉強ノートrequire(UsingR)
require(ggplot2)
require(tidyr)
require(dplyr)
require(plotly)
require(coefplot)
acs <- read.table("http://jaredlander.com/data/acs_ny.csv", sep = ",",
header = TRUE, stringsAsFactors = FALSE)
デザイン行列とは \[ y_i = a + b_1 x_{i1} + \cdots + b_k x_{ik} + \epsilon \] と回帰するときに, \[ y_i = \begin{pmatrix}1 &x_1 &\cdots &x_k\end{pmatrix} \begin{pmatrix}a &b_1 &\cdots &b_k\end{pmatrix}^T + \epsilon_i,\\ \] \[ \vec{y} = \begin{pmatrix} 1 &x_{11} &x_{12} &\cdots &x_{1k}\\ \vdots\\ 1 &x_{n1} &x_{n2} &\cdots &x_{nk} \end{pmatrix} \begin{pmatrix} a\\ b_1\\ \vdots\\ b_k \end{pmatrix} + \vec{\epsilon} \]
head(acs)
## Acres FamilyIncome FamilyType NumBedrooms NumChildren NumPeople
## 1 1-10 150 Married 4 1 3
## 2 1-10 180 Female Head 3 2 4
## 3 1-10 280 Female Head 4 0 2
## 4 1-10 330 Female Head 2 1 2
## 5 1-10 330 Male Head 3 1 2
## 6 1-10 480 Male Head 0 3 4
## NumRooms NumUnits NumVehicles NumWorkers OwnRent YearBuilt
## 1 9 Single detached 1 0 Mortgage 1950-1959
## 2 6 Single detached 2 0 Rented Before 1939
## 3 8 Single detached 3 1 Mortgage 2000-2004
## 4 4 Single detached 1 0 Rented 1950-1959
## 5 5 Single attached 1 0 Mortgage Before 1939
## 6 1 Single detached 0 0 Rented Before 1939
## HouseCosts ElectricBill FoodStamp HeatingFuel Insurance Language
## 1 1800 90 No Gas 2500 English
## 2 850 90 No Oil 0 English
## 3 2600 260 No Oil 6600 Other European
## 4 1800 140 No Oil 0 English
## 5 860 150 No Gas 660 Spanish
## 6 700 140 No Gas 0 English
require(useful)
## Loading required package: useful
acs$Income <- with(acs, FamilyIncome >= 150000) # logistic regressionの実験のための教師ラベル
acsX <- build.x(Income ~ NumBedrooms + NumChildren + NumPeople +
NumRooms + NumUnits + NumVehicles + NumWorkers +
OwnRent + YearBuilt + ElectricBill + FoodStamp +
HeatingFuel + Insurance + Language - 1,
data=acs, contrasts=FALSE)
行列acsXの左上を見る。
topleft(acsX)
## NumBedrooms NumChildren NumPeople NumRooms NumUnitsMobile home
## 1 4 1 3 9 0
## 2 3 2 4 6 0
## 3 4 0 2 8 0
## 4 2 1 2 4 0
## 5 3 1 2 5 0
acsY <- build.y(Income ~ NumBedrooms + NumChildren + NumPeople +
NumRooms + NumUnits + NumVehicles + NumWorkers +
OwnRent + YearBuilt + ElectricBill + FoodStamp +
HeatingFuel + Insurance + Language - 1, data=acs) # = acs$Income
require(glmnet)
## Loading required package: glmnet
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
##
## expand
## Loading required package: foreach
## Loaded glmnet 2.0-5
set.seed(1863561)
# glmnetでクロスバリデーションを行う
acsCV1 <- cv.glmnet(x = acsX, y = acsY, family = "binomial", nfold = 5)
plot(acsCV1)
American Community Surveyデータにglmnetを適用させたときのクロスバリデ ーション曲線。上部の数字は与えられたlog(λ)に対するモデル中の変数の個数(factor levelsは個別の変数としてカウントされる)。点はクロスバリデーションエラーを示す。点 から伸びる縦の線は信頼区間を示す。2つの垂直線のうち、左は誤差が最小になるλの値を 示し、右は最小値から1標準誤差の範囲に収まるようなλのうち最大の値を示す。
coef(acsCV1)
## 45 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -5.0552170103
## NumBedrooms 0.0542621380
## NumChildren .
## NumPeople .
## NumRooms 0.1102021934
## NumUnitsMobile home -0.8960712560
## NumUnitsSingle attached .
## NumUnitsSingle detached .
## NumVehicles 0.1283171343
## NumWorkers 0.4806697219
## OwnRentMortgage .
## OwnRentOutright 0.2574766773
## OwnRentRented -0.1790627645
## YearBuilt15 .
## YearBuilt1940-1949 -0.0253908040
## YearBuilt1950-1959 .
## YearBuilt1960-1969 .
## YearBuilt1970-1979 -0.0063336086
## YearBuilt1980-1989 0.0147761442
## YearBuilt1990-1999 .
## YearBuilt2000-2004 .
## YearBuilt2005 .
## YearBuilt2006 .
## YearBuilt2007 .
## YearBuilt2008 .
## YearBuilt2009 .
## YearBuilt2010 .
## YearBuiltBefore 1939 -0.1829643904
## ElectricBill 0.0018200312
## FoodStampNo 0.7071289660
## FoodStampYes .
## HeatingFuelCoal -0.2635263281
## HeatingFuelElectricity .
## HeatingFuelGas .
## HeatingFuelNone .
## HeatingFuelOil .
## HeatingFuelOther .
## HeatingFuelSolar .
## HeatingFuelWood -0.7454315355
## Insurance 0.0004973315
## LanguageAsian Pacific 0.3606176925
## LanguageEnglish .
## LanguageOther .
## LanguageOther European 0.0389641675
## LanguageSpanish .
. は選ばれなかったパラメータを指す.
set.seed(71623)
acsCV2 <- cv.glmnet(x = acsX, y = acsY, family = "binomial", nfold = 5,
alpha = 0)
plot(acsCV2)
coefplot(acsCV2)
## Warning: Removed 45 rows containing missing values (geom_errorbarh).
## Warning: Removed 45 rows containing missing values (geom_errorbarh).
ridge回帰ではスパースにならない。
load("../data/ideo.rdata")
head(ideo)
## Year Vote Age Gender Race Education
## 1 1948 democrat NA male white grade school of less (0-8 grades)
## 2 1948 republican NA female white high school (12 grades or fewer, incl
## 3 1948 democrat NA female white high school (12 grades or fewer, incl
## 4 1948 republican NA female white some college(13 grades or more,but no
## 5 1948 democrat NA male white some college(13 grades or more,but no
## 6 1948 republican NA female white high school (12 grades or fewer, incl
## Income Religion
## 1 34 to 67 percentile protestant
## 2 96 to 100 percentile protestant
## 3 68 to 95 percentile catholic (roman catholic)
## 4 96 to 100 percentile protestant
## 5 68 to 95 percentile catholic (roman catholic)
## 6 96 to 100 percentile protestant
theYears <- unique(ideo$Year)
# 結果を保持するために、上で作った年 vector と同じ長さの空の list を作成
# 事前に長さを決めておけば実行速度が速くなる
results <- vector(mode="list", length=length(theYears))
# この list の要素に適切な名前を与える
names(results) <- theYears
# 各年ごとのデータに対してモデルを適合させる
for(i in theYears)
{
results[[as.character(i)]] <- glm(Vote ~ Race + Income + Gender +
Education,
data=ideo, subset=Year==i,
family=binomial(link="logit"))
}
g <- multiplot(results, coefficients="Raceblack", secret.weapon = T) + coord_flip(xlim=c(-20, 10))
ggplotly(g)
require(arm)
## Loading required package: arm
## Loading required package: lme4
##
## arm (Version 1.9-1, built: 2016-8-21)
## Working directory is /Users/yoshi/Documents/petraWorkspace/ML_study/shrinkage/doc
##
## Attaching package: 'arm'
## The following objects are masked from 'package:coefplot':
##
## coefplot, coefplot.default
resultsB <- vector(mode="list", length=length(theYears))
names(resultsB) <- theYears
for(i in theYears)
{
# 尺度パラメータ2.5のコーシー事前分布を持つモデルに適合させる
resultsB[[as.character(i)]] <-
arm::bayesglm(Vote ~ Race + Income + Gender + Education,
data=ideo[ideo$Year==i, ],
family=binomial(link="logit"),
prior.scale=2.5, prior.df=1)
}
multiplot(resultsB, coefficients="Raceblack", secret.weapon=TRUE)
フィッティングがよくなっていることが分かる。