2.1 労働市場における人種差別
setwd("/cloud/project/CAUSALITY")
library(readr)
resume <- read_csv("resume.csv")
## Rows: 4870 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): firstname, sex, race
## dbl (1): call
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(resume)
## [1] 4870 4
head(resume)
## # A tibble: 6 × 4
## firstname sex race call
## <chr> <chr> <chr> <dbl>
## 1 Allison female white 0
## 2 Kristen female white 0
## 3 Lakisha female black 0
## 4 Latonya female black 0
## 5 Carrie female white 0
## 6 Jay male white 0
summary(resume)
## firstname sex race call
## Length:4870 Length:4870 Length:4870 Min. :0.00000
## Class :character Class :character Class :character 1st Qu.:0.00000
## Mode :character Mode :character Mode :character Median :0.00000
## Mean :0.08049
## 3rd Qu.:0.00000
## Max. :1.00000
race.call.tab <- table(race = resume$race, call = resume$call)
race.call.tab
## call
## race 0 1
## black 2278 157
## white 2200 235
addmargins(race.call.tab)
## call
## race 0 1 Sum
## black 2278 157 2435
## white 2200 235 2435
## Sum 4478 392 4870
## 審査通過率(全体の審査通過数を標本サイズで割ったもの)
sum(race.call.tab[, 2])/nrow(resume)
## [1] 0.08049281
(157+235)/4870
## [1] 0.08049281
## 人種ごとの審査通過率
race.call.tab[1, 2] / sum(race.call.tab[1, ]) #黒人
## [1] 0.06447639
157/(2278+157)
## [1] 0.06447639
race.call.tab[2, 2] / sum(race.call.tab[2, ]) #白人
## [1] 0.09650924
235/(2200+235)
## [1] 0.09650924
race.call.tab[1, ] #1行目
## 0 1
## 2278 157
race.call.tab[ ,2]
## black white
## 157 235
mean(resume$call)
## [1] 0.08049281
library(readxl)
asa <- read_excel("asa.xlsx")
mean(asa$asa)
## [1] 0.75
2.2 Rでデータを部分集合化する
2.2.1 論理値と論理演算子
class(TRUE)
## [1] "logical"
as.integer(TRUE)
## [1] 1
as.integer(FALSE)
## [1] 0
x <- c(TRUE, FALSE, TRUE) #論理値のベクトル
mean(x) #TRUEの割合
## [1] 0.6666667
sum(x) #TRUEの個数
## [1] 2
FALSE & TRUE
## [1] FALSE
TRUE & TRUE
## [1] TRUE
TRUE | FALSE
## [1] TRUE
FALSE | FALSE
## [1] FALSE
TRUE & FALSE & TRUE
## [1] FALSE
(TRUE | FALSE) & FALSE
## [1] FALSE
TRUE | (FALSE & FALSE)
## [1] TRUE
TF1 <- c(TRUE, FALSE, FALSE)
TF2 <- c(TRUE, FALSE, TRUE)
TF1 | TF2
## [1] TRUE FALSE TRUE
TF1 & TF2
## [1] TRUE FALSE FALSE
4 > 3
## [1] TRUE
"Hello" == "hello"
## [1] FALSE
"Hello" != "hello"
## [1] TRUE
x <- c(3, 2, 1, -2, -1)
x >= 2
## [1] TRUE TRUE FALSE FALSE FALSE
x != 1
## [1] TRUE TRUE FALSE TRUE TRUE
## 論理値をもつ2つのベクトルの論理積
(x > 0) & (x <=2)
## [1] FALSE TRUE TRUE FALSE FALSE
## 論理値をもつ2つのベクトルの論理和
(x > 2) | (x <= -1)
## [1] TRUE FALSE FALSE TRUE TRUE
x.int <- (x > 0) & (x <=2) # 論理ベクトル
x.int
## [1] FALSE TRUE TRUE FALSE FALSE
mean(x.int)
## [1] 0.4
sum(x.int)
## [1] 2
2.2.3 部分集合化
## 黒人らしい名前の審査通過率
mean(resume$call[resume$race == "black"])
## [1] 0.06447639
## 最初の5つの観察の人種
resume$race[1:5]
## [1] "white" "white" "black" "black" "white"
## 最初の5つの観察を比較
(resume$race == "black")[1:5]
## [1] FALSE FALSE TRUE TRUE FALSE
dim(resume) #元のデータフレームに含まれる行・列の数
## [1] 4870 4
## 黒人のみ部分集合化する
resumeB <- resume[resume$race == "black",]
dim(resumeB) # このデータフレームは元のデータフレームよりも行数が少ない
## [1] 2435 4
mean(resumeB$call)
## [1] 0.06447639
## call変数とfirstname変数を残す
## 同時に、黒人女性らしい名前の観察を残す
resumeBf <- subset(resume, select = c("call", "firstname"), subset = (race == "black" & sex == "female"))
head(resumeBf)
## # A tibble: 6 × 2
## call firstname
## <dbl> <chr>
## 1 0 Lakisha
## 2 0 Latonya
## 3 0 Kenya
## 4 0 Latonya
## 5 0 Aisha
## 6 0 Aisha
## 同じ結果を得る別のシンタックス
resumeBf <- resume[resume$race == "black" & resume$sex == "female", c("call", "firstname")]
## 黒人男性
resumeBm <- subset(resume, subset = (race == "black") & (sex == "male"))
## 白人女性
resumeWf <- subset(resume, subset = (race == "white") & (sex == "female"))
## 白人男性
resumeWm <- subset(resume, subset = (race == "white") & (sex == "male"))
## 人種格差
mean(resumeWf$call) - mean(resumeBf$call) # 女性同士
## [1] 0.03264689
mean(resumeWm$call) - mean(resumeBm$call) # 男性同士
## [1] 0.03040786
2.2.4 簡単な条件文
resume$BlackFemale <- ifelse(resume$race == "black" & resume$sex == "female", 1, 0)
table(race = resume$race, sex = resume$sex, BlackFemale = resume$BlackFemale)
## , , BlackFemale = 0
##
## sex
## race female male
## black 0 549
## white 1860 575
##
## , , BlackFemale = 1
##
## sex
## race female male
## black 1886 0
## white 0 0
2.2.5 因子変数
resume$type <- NA
resume$type[resume$race == "black" & resume$sex == "female"] <- "BlackFemale"
resume$type[resume$race == "black" & resume$sex == "male"] <- "BlackMale"
resume$type[resume$race == "white" & resume$sex == "female"] <- "WhiteFemale"
resume$type[resume$race == "white" & resume$sex == "male"] <- "WhiteMale"
## オブジェクトのクラスを確認する
class(resume$type)
## [1] "character"
## 新しい文字変数を因子変数に変換する
resume$type <- as.factor(resume$type)
## 因子変数のレベルのリストを出す
levels(resume$type)
## [1] "BlackFemale" "BlackMale" "WhiteFemale" "WhiteMale"
## 各レベルの観察の数を取り出す
table(resume$type)
##
## BlackFemale BlackMale WhiteFemale WhiteMale
## 1886 549 1860 575
tapply(resume$call, resume$type, mean)
## BlackFemale BlackMale WhiteFemale WhiteMale
## 0.06627784 0.05828780 0.09892473 0.08869565
## 名前を因子変数に変換する
resume$firstname <- as.factor(resume$firstname)
## それぞれの名前の審査通過率を計算する
callback.name <- tapply(resume$call, resume$firstname, mean)
## 結果を昇順に並び替える
sort(callback.name)
## Aisha Rasheed Keisha Tremayne Kareem Darnell Tyrone
## 0.02222222 0.02985075 0.03825137 0.04347826 0.04687500 0.04761905 0.05333333
## Hakim Tamika Lakisha Tanisha Todd Jamal Neil
## 0.05454545 0.05468750 0.05500000 0.05797101 0.05882353 0.06557377 0.06578947
## Brett Geoffrey Brendan Greg Emily Anne Jill
## 0.06779661 0.06779661 0.07692308 0.07843137 0.07929515 0.08264463 0.08374384
## Latoya Kenya Matthew Latonya Leroy Allison Ebony
## 0.08407080 0.08673469 0.08955224 0.09130435 0.09375000 0.09482759 0.09615385
## Jermaine Laurie Sarah Meredith Carrie Kristen Jay
## 0.09615385 0.09743590 0.09844560 0.10160428 0.13095238 0.13145540 0.13432836
## Brad
## 0.15873016
2.3 因果効果と反事実
resume[1,]
## # A tibble: 1 × 6
## firstname sex race call BlackFemale type
## <fct> <chr> <chr> <dbl> <dbl> <fct>
## 1 Allison female white 0 0 WhiteFemale
2.4 ランダム化比較試験
library(readr)
social <- read_csv("social.csv") #データの読み込み
## Rows: 305866 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): sex, messages
## dbl (4): yearofbirth, primary2004, primary2006, hhsize
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(social) #データの要約
## sex yearofbirth primary2004 messages
## Length:305866 Min. :1900 Min. :0.0000 Length:305866
## Class :character 1st Qu.:1947 1st Qu.:0.0000 Class :character
## Mode :character Median :1956 Median :0.0000 Mode :character
## Mean :1956 Mean :0.4014
## 3rd Qu.:1965 3rd Qu.:1.0000
## Max. :1986 Max. :1.0000
## primary2006 hhsize
## Min. :0.0000 Min. :1.000
## 1st Qu.:0.0000 1st Qu.:2.000
## Median :0.0000 Median :2.000
## Mean :0.3122 Mean :2.184
## 3rd Qu.:1.0000 3rd Qu.:2.000
## Max. :1.0000 Max. :8.000
## 各グループの投票率
tapply(social$primary2006, social$messages, mean)
## Civic Duty Control Hawthorne Neighbors
## 0.3145377 0.2966383 0.3223746 0.3779482
## コントロールグループの投票率
mean(social$primary2006[social$messages == "Control"])
## [1] 0.2966383
## 各グループの投票率からコントロールグループの投票率を引く
tapply(social$primary2006, social$messages, mean) - mean(social$primary2006[social$messages == "Control"])
## Civic Duty Control Hawthorne Neighbors
## 0.01789934 0.00000000 0.02573631 0.08130991
social$age <- 2006 - social$yearofbirth # 年齢変数を作成する
# トリートメント前の変数のバランス確認
tapply(social$age, social$messages, mean)
## Civic Duty Control Hawthorne Neighbors
## 49.65904 49.81355 49.70480 49.85294
tapply(social$primary2004, social$messages, mean)
## Civic Duty Control Hawthorne Neighbors
## 0.3994453 0.4003388 0.4032300 0.4066647
tapply(social$hhsize, social$messages, mean)
## Civic Duty Control Hawthorne Neighbors
## 2.189126 2.183667 2.180138 2.187770
2.5 観察研究
library(readr)
minwage <- read_csv("minwage.csv")
## Rows: 358 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): chain, location
## dbl (6): wageBefore, wageAfter, fullBefore, fullAfter, partBefore, partAfter
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
dim(minwage)
## [1] 358 8
summary(minwage)
## chain location wageBefore wageAfter
## Length:358 Length:358 Min. :4.250 Min. :4.250
## Class :character Class :character 1st Qu.:4.250 1st Qu.:5.050
## Mode :character Mode :character Median :4.500 Median :5.050
## Mean :4.618 Mean :4.994
## 3rd Qu.:4.987 3rd Qu.:5.050
## Max. :5.750 Max. :6.250
## fullBefore fullAfter partBefore partAfter
## Min. : 0.000 Min. : 0.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 2.125 1st Qu.: 2.000 1st Qu.:11.00 1st Qu.:11.00
## Median : 6.000 Median : 6.000 Median :16.25 Median :17.00
## Mean : 8.475 Mean : 8.362 Mean :18.75 Mean :18.69
## 3rd Qu.:12.000 3rd Qu.:12.000 3rd Qu.:25.00 3rd Qu.:25.00
## Max. :60.000 Max. :40.000 Max. :60.00 Max. :60.00
## 2つの州ごとにデータを部分集合化する
minwageNJ <- subset(minwage, subset = (location != "PA"))
minwagePA <- subset(minwage, subset = (location == "PA"))
## 賃金が5.05ドル以下のファーストフード店の割合
mean(minwageNJ$wageBefore < 5.05) # 法施行「前」のNJ
## [1] 0.9106529
mean(minwageNJ$wageAfter < 5.05) # 法施行「後」のNJ
## [1] 0.003436426
mean(minwagePA$wageBefore < 5.05) # 法施行「前」のPA
## [1] 0.9402985
mean(minwagePA$wageAfter < 5.05) # 法施行「後」のPA
## [1] 0.9552239
## NJとPAにおける常勤労働者の割合を表す変数を作成する
minwageNJ$fullPropAfter <- minwageNJ$fullAfter / (minwageNJ$fullAfter + minwageNJ$partAfter)
minwagePA$fullPropAfter <- minwagePA$fullAfter / (minwagePA$fullAfter + minwagePA$partAfter)
## 平均の差を計算する
mean(minwageNJ$fullPropAfter) - mean(minwagePA$fullPropAfter)
## [1] 0.04811886
# ファーストフードチェーンの割合の表
prop.table(table(minwageNJ$chain))
##
## burgerking kfc roys wendys
## 0.4054983 0.2233677 0.2508591 0.1202749
# ファーストフードチェーンの割合の表
prop.table(table(minwagePA$chain))
##
## burgerking kfc roys wendys
## 0.4626866 0.1492537 0.2238806 0.1641791
## バーガーキングのみ部分集合化する
minwageNJ.bk <- subset(minwageNJ, subset = (chain == "burgerking"))
minwagePA.bk <- subset(minwagePA, subset = (chain == "burgerking"))
## 常勤雇用の割合の比較
mean(minwageNJ.bk$fullPropAfter) - mean(minwagePA.bk$fullPropAfter)
## [1] 0.03643934
minwageNJ.bk.subset <- subset(minwageNJ.bk, subset = ((location != "shoreNJ") & (location != "centralNJ")))
mean(minwageNJ.bk.subset$fullPropAfter) - mean(minwagePA.bk$fullPropAfter)
## [1] 0.03149853
2.5.3 事前・事後の比較と差の差分法
## NJでの法施行前の常勤雇用の割合
minwageNJ$fullPropBefore <- minwageNJ$fullBefore / (minwageNJ$fullBefore + minwageNJ$partBefore)
## 最低賃金引き上げ前後の平均の差
NJdiff <- mean(minwageNJ$fullPropAfter) - mean(minwageNJ$fullPropBefore)
NJdiff
## [1] 0.02387474
# 教科書ではやっていない計算
# NJのBeforeの常勤雇用の割合
N <- mean(minwageNJ$fullPropBefore)
# PAのBeforeの常勤雇用の割合
minwagePA$fullPropBefore <- minwagePA$fullBefore / (minwagePA$fullBefore + minwagePA$partBefore)
P <- mean(minwagePA$fullPropBefore)
# NJのAfterの常勤雇用の割合
Ndash <- mean(minwageNJ$fullPropAfter)
# PAのAfterの常勤雇用の割合
Pdash <- mean(minwagePA$fullPropAfter)
# NJの反事実のAfterの常勤雇用の割合
Nstar <- N + (Pdash - P)
Nstar
## [1] 0.2588427
ATT <- Ndash - Nstar
ATT
## [1] 0.06155831
2.6 1変数の記述統計量
2.6.1 分位数
## NJとPAの横断的な比較
median(minwageNJ$fullPropAfter) - median(minwagePA$fullPropAfter)
## [1] 0.07291667
## 事前・事後の比較
NJdiff_med <- median(minwageNJ$fullPropAfter) - median(minwageNJ$fullPropBefore)
NJdiff_med
## [1] 0.025
## 差の差分の中央値
PAdiff_med <- median(minwagePA$fullPropAfter) - median(minwagePA$fullPropBefore)
PAdiff_med
## [1] -0.01201923
NJdiff_med - PAdiff_med
## [1] 0.03701923
## summary では最小値、最大値、平均とともに四分位数が表示される
summary(minwageNJ$wageBefore)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.25 4.25 4.50 4.61 4.87 5.75
summary(minwageNJ$wageAfter)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 5.000 5.050 5.050 5.081 5.050 5.750
# 四分位範囲
IQR(minwageNJ$wageBefore)
## [1] 0.62
4.87-4.25
## [1] 0.62
IQR(minwageNJ$wageAfter)
## [1] 0
5.05-5.05
## [1] 0
# 十分位数
quantile(minwageNJ$wageBefore, probs = seq(from = 0, to = 1, by = 0.1))
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 4.25 4.25 4.25 4.25 4.50 4.50 4.65 4.75 5.00 5.00 5.75
quantile(minwageNJ$wageAfter, probs = seq(from = 0, to = 1, by = 0.1))
## 0% 10% 20% 30% 40% 50% 60% 70% 80% 90% 100%
## 5.00 5.05 5.05 5.05 5.05 5.05 5.05 5.05 5.05 5.15 5.75
2.6.2 標準偏差
sqrt(mean((minwageNJ$fullPropAfter - minwageNJ$fullPropBefore)^2))
## [1] 0.3014669
mean(minwageNJ$fullPropAfter - minwageNJ$fullPropBefore)
## [1] 0.02387474
## 標準偏差
sd(minwageNJ$fullPropBefore)
## [1] 0.2304592
sd(minwageNJ$fullPropAfter)
## [1] 0.2510016
## 分散
var(minwageNJ$fullPropBefore)
## [1] 0.05311145
var(minwageNJ$fullPropAfter)
## [1] 0.0630018
テキスト
- 今井耕介 (2018)『社会科学のためのデータ分析入門
(上)』岩波書店。