require(UsingR)
require(ggplot2)
require(tidyr)
require(dplyr)
require(plotly)
require(coefplot)

1 Lasso

acs <- read.table("http://jaredlander.com/data/acs_ny.csv", sep = ",",
                  header = TRUE, stringsAsFactors = FALSE)

The design matrix: when we fit the regression \[ y_i = a + b_1 x_{i1} + \cdots + b_k x_{ik} + \epsilon_i, \] each observation can be written as \[ y_i = \begin{pmatrix}1 & x_{i1} & \cdots & x_{ik}\end{pmatrix} \begin{pmatrix}a & b_1 & \cdots & b_k\end{pmatrix}^T + \epsilon_i, \] and stacking all n observations gives \[ \vec{y} = \begin{pmatrix} 1 & x_{11} & x_{12} & \cdots & x_{1k}\\ \vdots & & & & \vdots\\ 1 & x_{n1} & x_{n2} & \cdots & x_{nk} \end{pmatrix} \begin{pmatrix} a\\ b_1\\ \vdots\\ b_k \end{pmatrix} + \vec{\epsilon}. \] The n × (k+1) matrix of predictors on the right-hand side is the design matrix.
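
As a minimal sketch of the same idea in R (not part of the analysis below), model.matrix() builds this design matrix from a formula, expanding factor variables into indicator columns; the toy data frame here is purely hypothetical.

d <- data.frame(y = c(1.2, 0.7, 3.1), x1 = c(2, 5, 1), x2 = factor(c("a", "b", "a")))
model.matrix(y ~ x1 + x2, data = d) # columns: (Intercept), x1, and the dummy column x2b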

head(acs)
##   Acres FamilyIncome  FamilyType NumBedrooms NumChildren NumPeople
## 1  1-10          150     Married           4           1         3
## 2  1-10          180 Female Head           3           2         4
## 3  1-10          280 Female Head           4           0         2
## 4  1-10          330 Female Head           2           1         2
## 5  1-10          330   Male Head           3           1         2
## 6  1-10          480   Male Head           0           3         4
##   NumRooms        NumUnits NumVehicles NumWorkers  OwnRent   YearBuilt
## 1        9 Single detached           1          0 Mortgage   1950-1959
## 2        6 Single detached           2          0   Rented Before 1939
## 3        8 Single detached           3          1 Mortgage   2000-2004
## 4        4 Single detached           1          0   Rented   1950-1959
## 5        5 Single attached           1          0 Mortgage Before 1939
## 6        1 Single detached           0          0   Rented Before 1939
##   HouseCosts ElectricBill FoodStamp HeatingFuel Insurance       Language
## 1       1800           90        No         Gas      2500        English
## 2        850           90        No         Oil         0        English
## 3       2600          260        No         Oil      6600 Other European
## 4       1800          140        No         Oil         0        English
## 5        860          150        No         Gas       660        Spanish
## 6        700          140        No         Gas         0        English
require(useful)
## Loading required package: useful
acs$Income <- with(acs, FamilyIncome >= 150000) # response label for the logistic-regression experiment
acsX <- build.x(Income ~ NumBedrooms + NumChildren + NumPeople +
                    NumRooms + NumUnits + NumVehicles + NumWorkers +
                    OwnRent + YearBuilt + ElectricBill + FoodStamp +
                    HeatingFuel + Insurance + Language - 1,
                data=acs, contrasts=FALSE)

Look at the top-left corner of the matrix acsX.

topleft(acsX)
##   NumBedrooms NumChildren NumPeople NumRooms NumUnitsMobile home
## 1           4           1         3        9                   0
## 2           3           2         4        6                   0
## 3           4           0         2        8                   0
## 4           2           1         2        4                   0
## 5           3           1         2        5                   0
acsY <- build.y(Income ~ NumBedrooms + NumChildren + NumPeople +
                    NumRooms + NumUnits + NumVehicles + NumWorkers +
                    OwnRent + YearBuilt + ElectricBill + FoodStamp +
                    HeatingFuel + Insurance + Language - 1, data=acs) # = acs$Income
require(glmnet)
## Loading required package: glmnet
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: foreach
## Loaded glmnet 2.0-5
set.seed(1863561)
# run cross-validation with glmnet
acsCV1 <- cv.glmnet(x = acsX, y = acsY, family = "binomial", nfold = 5)
plot(acsCV1)

Cross-validation curve for glmnet applied to the American Community Survey data. The numbers along the top give the number of variables in the model at a given log(λ) (factor levels are counted as separate variables). The dots show the cross-validation error, and the vertical bars extending from them show confidence intervals. Of the two vertical lines, the left one marks the λ that minimizes the error, and the right one marks the largest λ whose error is within one standard error of that minimum.
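
The two λ values marked by the vertical lines are stored on the cv.glmnet object. As a sketch, they can be inspected directly and passed to coef() or predict() through the s argument (for a cv.glmnet object, coef() defaults to s = "lambda.1se").

acsCV1$lambda.min # λ that minimizes the cross-validation error
acsCV1$lambda.1se # largest λ within one standard error of that minimum
coef(acsCV1, s = "lambda.min") # coefficients at the error-minimizing λ instead of the default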

coef(acsCV1)
## 45 x 1 sparse Matrix of class "dgCMatrix"
##                                     1
## (Intercept)             -5.0552170103
## NumBedrooms              0.0542621380
## NumChildren              .           
## NumPeople                .           
## NumRooms                 0.1102021934
## NumUnitsMobile home     -0.8960712560
## NumUnitsSingle attached  .           
## NumUnitsSingle detached  .           
## NumVehicles              0.1283171343
## NumWorkers               0.4806697219
## OwnRentMortgage          .           
## OwnRentOutright          0.2574766773
## OwnRentRented           -0.1790627645
## YearBuilt15              .           
## YearBuilt1940-1949      -0.0253908040
## YearBuilt1950-1959       .           
## YearBuilt1960-1969       .           
## YearBuilt1970-1979      -0.0063336086
## YearBuilt1980-1989       0.0147761442
## YearBuilt1990-1999       .           
## YearBuilt2000-2004       .           
## YearBuilt2005            .           
## YearBuilt2006            .           
## YearBuilt2007            .           
## YearBuilt2008            .           
## YearBuilt2009            .           
## YearBuilt2010            .           
## YearBuiltBefore 1939    -0.1829643904
## ElectricBill             0.0018200312
## FoodStampNo              0.7071289660
## FoodStampYes             .           
## HeatingFuelCoal         -0.2635263281
## HeatingFuelElectricity   .           
## HeatingFuelGas           .           
## HeatingFuelNone          .           
## HeatingFuelOil           .           
## HeatingFuelOther         .           
## HeatingFuelSolar         .           
## HeatingFuelWood         -0.7454315355
## Insurance                0.0004973315
## LanguageAsian Pacific    0.3606176925
## LanguageEnglish          .           
## LanguageOther            .           
## LanguageOther European   0.0389641675
## LanguageSpanish          .

A dot (.) indicates a parameter that was not selected; its coefficient was shrunk to exactly zero.
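
To list just the variables the lasso kept, as a small sketch: convert the sparse coefficient matrix above to a dense one and drop the zero rows.

cv1Coef <- as.matrix(coef(acsCV1)) # dense one-column matrix
cv1Coef[cv1Coef[, 1] != 0, , drop = FALSE] # only the coefficients the lasso retained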

2 Ridge regression

set.seed(71623)
acsCV2 <- cv.glmnet(x = acsX, y = acsY, family = "binomial", nfold = 5,
                    alpha = 0)
plot(acsCV2)

coefplot(acsCV2)
## Warning: Removed 45 rows containing missing values (geom_errorbarh).

## Warning: Removed 45 rows containing missing values (geom_errorbarh).

Ridge regression does not produce a sparse solution: the coefficients are shrunk toward zero, but none are set exactly to zero.
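
A quick way to see this, as a sketch: count how many coefficients are exactly zero in each cross-validated fit. The lasso fit (acsCV1) has many, while the ridge fit (acsCV2) typically has none.

sum(as.matrix(coef(acsCV1)) == 0) # lasso: many coefficients are exactly zero
sum(as.matrix(coef(acsCV2)) == 0) # ridge: coefficients shrunk toward zero, but not to zero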

3 Bayesian shrinkage

load("../data/ideo.rdata")
head(ideo)
##   Year       Vote Age Gender  Race                             Education
## 1 1948   democrat  NA   male white     grade school of less (0-8 grades)
## 2 1948 republican  NA female white high school (12 grades or fewer, incl
## 3 1948   democrat  NA female white high school (12 grades or fewer, incl
## 4 1948 republican  NA female white some college(13 grades or more,but no
## 5 1948   democrat  NA   male white some college(13 grades or more,but no
## 6 1948 republican  NA female white high school (12 grades or fewer, incl
##                 Income                  Religion
## 1  34 to 67 percentile                protestant
## 2 96 to 100 percentile                protestant
## 3  68 to 95 percentile catholic (roman catholic)
## 4 96 to 100 percentile                protestant
## 5  68 to 95 percentile catholic (roman catholic)
## 6 96 to 100 percentile                protestant

3.1 Without a prior

theYears <- unique(ideo$Year)
# create an empty list, the same length as the vector of years above, to hold the results
# fixing the length in advance makes the loop run faster
results <- vector(mode="list", length=length(theYears))
# give the list elements appropriate names
names(results) <- theYears
# fit a model to each year's data
for(i in theYears)
{
    results[[as.character(i)]] <- glm(Vote ~ Race + Income + Gender + Education,
                                      data=ideo, subset=Year==i,
                                      family=binomial(link="logit"))
}
g <- multiplot(results, coefficients="Raceblack", secret.weapon=TRUE) +
    coord_flip(xlim=c(-20, 10))
ggplotly(g)
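
What the secret.weapon plot displays can also be collected by hand. As a sketch (assuming Raceblack appears in every yearly model), pull the estimate and standard error for Raceblack out of each fit:

raceCoefs <- t(sapply(results, function(m)
    summary(m)$coefficients["Raceblack", c("Estimate", "Std. Error")]))
head(raceCoefs) # one row per election year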

3.2 With a prior

require(arm)
## Loading required package: arm
## Loading required package: lme4
## 
## arm (Version 1.9-1, built: 2016-8-21)
## Working directory is /Users/yoshi/Documents/petraWorkspace/ML_study/shrinkage/doc
## 
## Attaching package: 'arm'
## The following objects are masked from 'package:coefplot':
## 
##     coefplot, coefplot.default
resultsB <- vector(mode="list", length=length(theYears))
names(resultsB) <- theYears
for(i in theYears)
{
    # fit a model with a Cauchy prior with scale parameter 2.5
    resultsB[[as.character(i)]] <-
        arm::bayesglm(Vote ~ Race + Income + Gender + Education,
                      data=ideo[ideo$Year==i, ],
                      family=binomial(link="logit"),
                      prior.scale=2.5, prior.df=1)
}
multiplot(resultsB, coefficients="Raceblack", secret.weapon=TRUE)

We can see that the fit has improved: with the prior, the extreme coefficient estimates are shrunk toward more plausible values.
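
As a sketch of that comparison, put the Raceblack estimates from the unregularized and the Cauchy-prior fits side by side; the prior pulls the extreme estimates toward zero.

# coefficient on Raceblack by year, with and without the prior
rbind(noPrior   = sapply(results,  function(m) unname(coef(m)["Raceblack"])),
      withPrior = sapply(resultsB, function(m) unname(coef(m)["Raceblack"])))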