require(UsingR)
require(ggplot2)
require(tidyr)
require(dplyr)
require(plotly)
require(coefplot)

1 Lasso

acs <- read.table("http://jaredlander.com/data/acs_ny.csv", sep = ",",
                  header = TRUE, stringsAsFactors = FALSE)

The design matrix: when we fit the regression \[ y_i = a + b_1 x_{i1} + \cdots + b_k x_{ik} + \epsilon_i, \] each observation can be written as \[ y_i = \begin{pmatrix}1 & x_{i1} & \cdots & x_{ik}\end{pmatrix} \begin{pmatrix}a & b_1 & \cdots & b_k\end{pmatrix}^T + \epsilon_i, \] and stacking all n observations gives \[ \vec{y} = \begin{pmatrix} 1 & x_{11} & x_{12} & \cdots & x_{1k}\\ \vdots & & & & \vdots\\ 1 & x_{n1} & x_{n2} & \cdots & x_{nk} \end{pmatrix} \begin{pmatrix} a\\ b_1\\ \vdots\\ b_k \end{pmatrix} + \vec{\epsilon}. \] The n × (k+1) matrix of predictors on the right-hand side is the design matrix.
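
As a minimal sketch of the same idea in R (not part of the analysis below), model.matrix() builds this design matrix from a formula, expanding factor variables into indicator columns; the toy data frame here is purely hypothetical.

d <- data.frame(y = c(1.2, 0.7, 3.1), x1 = c(2, 5, 1), x2 = factor(c("a", "b", "a")))
model.matrix(y ~ x1 + x2, data = d) # columns: (Intercept), x1, and the dummy column x2b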

head(acs)
##   Acres FamilyIncome  FamilyType NumBedrooms NumChildren NumPeople
## 1  1-10          150     Married           4           1         3
## 2  1-10          180 Female Head           3           2         4
## 3  1-10          280 Female Head           4           0         2
## 4  1-10          330 Female Head           2           1         2
## 5  1-10          330   Male Head           3           1         2
## 6  1-10          480   Male Head           0           3         4
##   NumRooms        NumUnits NumVehicles NumWorkers  OwnRent   YearBuilt
## 1        9 Single detached           1          0 Mortgage   1950-1959
## 2        6 Single detached           2          0   Rented Before 1939
## 3        8 Single detached           3          1 Mortgage   2000-2004
## 4        4 Single detached           1          0   Rented   1950-1959
## 5        5 Single attached           1          0 Mortgage Before 1939
## 6        1 Single detached           0          0   Rented Before 1939
##   HouseCosts ElectricBill FoodStamp HeatingFuel Insurance       Language
## 1       1800           90        No         Gas      2500        English
## 2        850           90        No         Oil         0        English
## 3       2600          260        No         Oil      6600 Other European
## 4       1800          140        No         Oil         0        English
## 5        860          150        No         Gas       660        Spanish
## 6        700          140        No         Gas         0        English
require(useful)
## Loading required package: useful
acs$Income <- with(acs, FamilyIncome >= 150000) # response label for the logistic-regression experiment
acsX <- build.x(Income ~ NumBedrooms + NumChildren + NumPeople +
                    NumRooms + NumUnits + NumVehicles + NumWorkers +
                    OwnRent + YearBuilt + ElectricBill + FoodStamp +
                    HeatingFuel + Insurance + Language - 1,
                data=acs, contrasts=FALSE)

Look at the top-left corner of the matrix acsX.

topleft(acsX)
##   NumBedrooms NumChildren NumPeople NumRooms NumUnitsMobile home
## 1           4           1         3        9                   0
## 2           3           2         4        6                   0
## 3           4           0         2        8                   0
## 4           2           1         2        4                   0
## 5           3           1         2        5                   0
acsY <- build.y(Income ~ NumBedrooms + NumChildren + NumPeople +
                    NumRooms + NumUnits + NumVehicles + NumWorkers +
                    OwnRent + YearBuilt + ElectricBill + FoodStamp +
                    HeatingFuel + Insurance + Language - 1, data=acs) # = acs$Income
require(glmnet)
## Loading required package: glmnet
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following object is masked from 'package:tidyr':
## 
##     expand
## Loading required package: foreach
## Loaded glmnet 2.0-5
set.seed(1863561)
# run cross-validation with glmnet
acsCV1 <- cv.glmnet(x = acsX, y = acsY, family = "binomial", nfold = 5)
plot(acsCV1)

Cross-validation curve for glmnet applied to the American Community Survey data. The numbers along the top give the number of variables in the model at a given log(λ) (factor levels are counted as separate variables). The dots show the cross-validation error, and the vertical bars extending from them show confidence intervals. Of the two vertical lines, the left one marks the λ that minimizes the error, and the right one marks the largest λ whose error is within one standard error of that minimum.
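
The two λ values marked by the vertical lines are stored on the cv.glmnet object. As a sketch, they can be inspected directly and passed to coef() or predict() through the s argument (for a cv.glmnet object, coef() defaults to s = "lambda.1se").

acsCV1$lambda.min # λ that minimizes the cross-validation error
acsCV1$lambda.1se # largest λ within one standard error of that minimum
coef(acsCV1, s = "lambda.min") # coefficients at the error-minimizing λ instead of the default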

coef(acsCV1)
## 45 x 1 sparse Matrix of class "dgCMatrix"
##                                     1
## (Intercept)             -5.0552170103
## NumBedrooms              0.0542621380
## NumChildren              .           
## NumPeople                .           
## NumRooms                 0.1102021934
## NumUnitsMobile home     -0.8960712560
## NumUnitsSingle attached  .           
## NumUnitsSingle detached  .           
## NumVehicles              0.1283171343
## NumWorkers               0.4806697219
## OwnRentMortgage          .           
## OwnRentOutright          0.2574766773
## OwnRentRented           -0.1790627645
## YearBuilt15              .           
## YearBuilt1940-1949      -0.0253908040
## YearBuilt1950-1959       .           
## YearBuilt1960-1969       .           
## YearBuilt1970-1979      -0.0063336086
## YearBuilt1980-1989       0.0147761442
## YearBuilt1990-1999       .           
## YearBuilt2000-2004       .           
## YearBuilt2005            .           
## YearBuilt2006            .           
## YearBuilt2007            .           
## YearBuilt2008            .           
## YearBuilt2009            .           
## YearBuilt2010            .           
## YearBuiltBefore 1939    -0.1829643904
## ElectricBill             0.0018200312
## FoodStampNo              0.7071289660
## FoodStampYes             .           
## HeatingFuelCoal         -0.2635263281
## HeatingFuelElectricity   .           
## HeatingFuelGas           .           
## HeatingFuelNone          .           
## HeatingFuelOil           .           
## HeatingFuelOther         .           
## HeatingFuelSolar         .           
## HeatingFuelWood         -0.7454315355
## Insurance                0.0004973315
## LanguageAsian Pacific    0.3606176925
## LanguageEnglish          .           
## LanguageOther            .           
## LanguageOther European   0.0389641675
## LanguageSpanish          .

A dot (.) indicates a parameter that was not selected; its coefficient was shrunk to exactly zero.
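
To list just the variables the lasso kept, as a small sketch: convert the sparse coefficient matrix above to a dense one and drop the zero rows.

cv1Coef <- as.matrix(coef(acsCV1)) # dense one-column matrix
cv1Coef[cv1Coef[, 1] != 0, , drop = FALSE] # only the coefficients the lasso retained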

2 Ridge regression

set.seed(71623)
acsCV2 <- cv.glmnet(x = acsX, y = acsY, family = "binomial", nfold = 5,
                    alpha = 0)
plot(acsCV2)

coefplot(acsCV2)
## Warning: Removed 45 rows containing missing values (geom_errorbarh).

## Warning: Removed 45 rows containing missing values (geom_errorbarh).

Ridge regression does not produce a sparse solution: the coefficients are shrunk toward zero, but none are set exactly to zero.
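
A quick way to see this, as a sketch: count how many coefficients are exactly zero in each cross-validated fit. The lasso fit (acsCV1) has many, while the ridge fit (acsCV2) typically has none.

sum(as.matrix(coef(acsCV1)) == 0) # lasso: many coefficients are exactly zero
sum(as.matrix(coef(acsCV2)) == 0) # ridge: coefficients shrunk toward zero, but not to zero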

3 Bayesian shrinkage

load("../data/ideo.rdata")
head(ideo)
##   Year       Vote Age Gender  Race                             Education
## 1 1948   democrat  NA   male white     grade school of less (0-8 grades)
## 2 1948 republican  NA female white high school (12 grades or fewer, incl
## 3 1948   democrat  NA female white high school (12 grades or fewer, incl
## 4 1948 republican  NA female white some college(13 grades or more,but no
## 5 1948   democrat  NA   male white some college(13 grades or more,but no
## 6 1948 republican  NA female white high school (12 grades or fewer, incl
##                 Income                  Religion
## 1  34 to 67 percentile                protestant
## 2 96 to 100 percentile                protestant
## 3  68 to 95 percentile catholic (roman catholic)
## 4 96 to 100 percentile                protestant
## 5  68 to 95 percentile catholic (roman catholic)
## 6 96 to 100 percentile                protestant

3.1 Without a prior

theYears <- unique(ideo$Year)
# create an empty list, the same length as the vector of years above, to hold the results
# fixing the length in advance makes the loop run faster
results <- vector(mode="list", length=length(theYears))
# give the list elements appropriate names
names(results) <- theYears
# fit a model to each year's data
for(i in theYears)
{
    results[[as.character(i)]] <- glm(Vote ~ Race + Income + Gender + Education,
                                      data=ideo, subset=Year==i,
                                      family=binomial(link="logit"))
}
g <- multiplot(results, coefficients="Raceblack", secret.weapon=TRUE) +
    coord_flip(xlim=c(-20, 10))
ggplotly(g)
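
What the secret.weapon plot displays can also be collected by hand. As a sketch (assuming Raceblack appears in every yearly model), pull the estimate and standard error for Raceblack out of each fit:

raceCoefs <- t(sapply(results, function(m)
    summary(m)$coefficients["Raceblack", c("Estimate", "Std. Error")]))
head(raceCoefs) # one row per election year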

3.2 With a prior

require(arm)
## Loading required package: arm
## Loading required package: lme4
## 
## arm (Version 1.9-1, built: 2016-8-21)
## Working directory is /Users/yoshi/Documents/petraWorkspace/ML_study/shrinkage/doc
## 
## Attaching package: 'arm'
## The following objects are masked from 'package:coefplot':
## 
##     coefplot, coefplot.default
resultsB <- vector(mode="list", length=length(theYears))
names(resultsB) <- theYears
for(i in theYears)
{
    # fit a model with a Cauchy prior with scale parameter 2.5
    resultsB[[as.character(i)]] <-
        arm::bayesglm(Vote ~ Race + Income + Gender + Education,
                      data=ideo[ideo$Year==i, ],
                      family=binomial(link="logit"),
                      prior.scale=2.5, prior.df=1)
}
multiplot(resultsB, coefficients="Raceblack", secret.weapon=TRUE)

We can see that the fit has improved: with the prior, the extreme coefficient estimates are shrunk toward more plausible values.
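
As a sketch of that comparison, put the Raceblack estimates from the unregularized and the Cauchy-prior fits side by side; the prior pulls the extreme estimates toward zero.

# coefficient on Raceblack by year, with and without the prior
rbind(noPrior   = sapply(results,  function(m) unname(coef(m)["Raceblack"])),
      withPrior = sapply(resultsB, function(m) unname(coef(m)["Raceblack"])))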