はじめに

Imai (2018) の第3章を勉強していく。 なお、Llaudet and Imai (2022) は、より現代的なテキストである。

3.1 戦時における民間人の被害を測定する

この章は、Lyall, Blair, and Imai (2013)Blair, Imai, and Lyall (2014) の研究が元になっている。

library(readr)
afghan <- read_csv("afghan.csv")
summary(afghan$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   22.00   30.00   32.39   40.00   80.00
summary(afghan$educ.years)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   1.000   4.002   8.000  18.000
summary(afghan$employed)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  1.0000  0.5828  1.0000  1.0000
summary(afghan$income)
##    Length     Class      Mode 
##      2754 character character
barplot(table(afghan$income))

table(afghan$income)
## 
##   10,001-20,000    2,001-10,000   20,001-30,000 less than 2,000     over 30,000 
##             616            1420              93             457              14
prop.table(table(ISAF = afghan$violent.exp.ISAF, Taliban = afghan$violent.exp.taliban))
##     Taliban
## ISAF         0         1
##    0 0.4953445 0.1318436
##    1 0.1769088 0.1959032

3.2 Rで欠損データを扱う

head(afghan$income, n = 10)
##  [1] "2,001-10,000"  "2,001-10,000"  "2,001-10,000"  "2,001-10,000" 
##  [5] "2,001-10,000"  NA              "10,001-20,000" "2,001-10,000" 
##  [9] "2,001-10,000"  NA
head(is.na(afghan$income), n = 10)
##  [1] FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE  TRUE
sum(is.na(afghan$income))
## [1] 154
mean(is.na(afghan$income))
## [1] 0.05591866
x <- c(1, 2, 3, NA)
x
## [1]  1  2  3 NA
mean(x)
## [1] NA
mean(x, na.rm = TRUE)
## [1] 2
prop.table(table(ISAF = afghan$violent.exp.ISAF, Taliban = afghan$violent.exp.taliban, exclude = NULL))
##       Taliban
## ISAF             0           1        <NA>
##   0    0.482933914 0.128540305 0.007988381
##   1    0.172476398 0.190994916 0.007988381
##   <NA> 0.002541757 0.002904866 0.003631082
afghan.sub <- na.omit(afghan)
nrow(afghan.sub)
## [1] 2554
length(na.omit(afghan$income))
## [1] 2600

3.3 一変量の分布をビジュアル化する

3.3.1 棒グラフ

ISAF.ptable <- prop.table(table(ISAF = afghan$violent.exp.ISAF, exclude = NULL))

ISAF.ptable 
## ISAF
##           0           1        <NA> 
## 0.619462600 0.371459695 0.009077705
barplot(ISAF.ptable,
        names.arg = c("被害なし", "被害あり", "無回答"),
        main = "ISAFによる民間人被害",
        xlab = "回答カテゴリー",
        ylab = "回答者の割合", ylim = c(0, 0.7))

  • barplotで棒グラフを描画できる。
  • prop.table()で比率を計算。
  • table()でクロス表を作成。exclude = NULLにより、ベースカテゴリも含める。
Taliban.ptable <- prop.table(table(Taliban = afghan$violent.exp.taliban, exclude = NULL))

Taliban.ptable 
## Taliban
##          0          1       <NA> 
## 0.65795207 0.32244009 0.01960784
barplot(Taliban.ptable,
        names.arg = c("被害なし", "被害あり", "無回答"),
        main = "Talibanによる民間人被害",
        xlab = "回答カテゴリー",
        ylab = "回答者の割合", ylim = c(0, 0.7))

3.3.2 ヒストグラム

hist(afghan$age, freq = FALSE, 
     ylim = c(0, 0.04),
     xlab = "年齢",
     main = "回答者の年齢分布")

- lines(x, y)で、座標(x, y)を通る線を引ける。

hist(afghan$educ.years, freq = FALSE,
     breaks = seq(from = -0.5, to = 18.5, by = 1),
     xlab = "教育を受けた年数",
     main = "回答者の教育程度の分布")
text(x = 2.0, y = 0.5, "中央値")
abline(v = median(afghan$educ.years))

lines(x = rep(mean(afghan$educ.years), 2), y = c(0, 0.5))
text(x = 5.0, y = 0.5, "平均値")

- 箱ひげ図

boxplot(afghan$age, main = "年齢の分布", 
        ylab = "年齢", 
        ylim = c(10, 80))

boxplot(afghan$educ.years ~ afghan$province,
        main = "州別の教育程度",
        ylab = "教育を受けた年数")

- tapply(x, group, mean)で、グループ別の平均値を計算。 - na.rm = TRUEにより、欠損値を除く。

tapply(afghan$violent.exp.taliban, 
       afghan$province, mean, na.rm = TRUE)
##    Helmand      Khost      Kunar      Logar    Uruzgan 
## 0.50422195 0.23322684 0.30303030 0.08024691 0.45454545
tapply(afghan$violent.exp.ISAF, 
       afghan$province, mean, na.rm = TRUE)
##   Helmand     Khost     Kunar     Logar   Uruzgan 
## 0.5410226 0.2424242 0.3989899 0.1440329 0.4960422
  • png(file = "educ.png")dev.off()でグラフを保存。
png(file = "educ.png")
boxplot(afghan$educ.years ~ afghan$province,
        main = "州別の教育程度", 
        ylab = "教育を受けた年数")
dev.off()
## png 
##   2
  • par(mfrow = c(1, 2)により2つのグラフを1行2列の配置にする。
png(file = "hist.png")
# 2つのグラフを1行2列の配置
par(mfrow = c(1, 2), cex = 0.8)
# グラフ1
hist(afghan$age, freq = FALSE,
     xlab = "年齢", ylim = c(0, 0.04),
     main = "回答者の年齢分布")
# グラフ2
hist(afghan$educ.years, freq = FALSE,
     breaks = seq(from = -0.5, to = 18.5, by = 1),
     xlab = "教育を受けた年数", 
     xlim = c(0, 20),
     main = "回答者の教育程度分布")
dev.off()
## png 
##   2

3.4 標本調査

library(readr)
afghan_village <- read_csv("afghan-village.csv")

boxplot(afghan_village$altitude ~ afghan_village$village.surveyed,
        ylab = "標高(メートル)", names = c("非抽出", "抽出"))

boxplot(log(afghan_village$population) ~ afghan_village$village.surveyed,
        ylab = "対数人口", names = c("非抽出", "抽出"))

tapply(is.na(afghan$violent.exp.taliban), afghan$province, mean)
##     Helmand       Khost       Kunar       Logar     Uruzgan 
## 0.030409357 0.006349206 0.000000000 0.000000000 0.062015504
tapply(is.na(afghan$violent.exp.ISAF), afghan$province, mean)
##     Helmand       Khost       Kunar       Logar     Uruzgan 
## 0.016374269 0.004761905 0.000000000 0.000000000 0.020671835
mean(afghan$list.response[afghan$list.group == "ISAF"]) -
  mean(afghan$list.response[afghan$list.group == "control"]) 
## [1] 0.04901961
table(response = afghan$list.response,
      group = afghan$list.group)
##         group
## response control ISAF taliban
##        0     188  174       0
##        1     265  278     433
##        2     265  260     287
##        3     200  182     198
##        4       0   24       0
library(readr)
congress <- read_csv("congress.csv")

rep <- subset(congress, subset = (party == "Republican"))

dem <- subset(congress, subset = (party == "Democrat"))


rep80 <- subset(rep, subset = (congress == 80))

dem80 <- subset(dem, subset = (congress == 80))

rep112 <- subset(rep, subset = (congress == 112))

dem112 <- subset(dem, subset = (congress == 112))
xlab <- "経済的リベラル/保守"
ylab <- "人種的リベラル/保守"
lim <- c(-1.5, 1.5)
example(points)
## 
## points> require(stats) # for rnorm
## 
## points> plot(-4:4, -4:4, type = "n")  # setting up coord. system

## 
## points> points(rnorm(200), rnorm(200), col = "red")
## 
## points> points(rnorm(100)/2, rnorm(100)/2, col = "blue", cex = 1.5)
## 
## points> op <- par(bg = "light blue")
## 
## points> x <- seq(0, 2*pi, length.out = 51)
## 
## points> ## something "between type='b' and type='o'":
## points> plot(x, sin(x), type = "o", pch = 21, bg = par("bg"), col = "blue", cex = .6,
## points+  main = 'plot(..., type="o", pch=21, bg=par("bg"))')

## 
## points> par(op)
## 
## points> ## Not run: 
## points> ##D ## The figure was produced by calls like
## points> ##D png("pch.png", height = 0.7, width = 7, res = 100, units = "in")
## points> ##D par(mar = rep(0,4))
## points> ##D plot(c(-1, 26), 0:1, type = "n", axes = FALSE)
## points> ##D text(0:25, 0.6, 0:25, cex = 0.5)
## points> ##D points(0:25, rep(0.3, 26), pch = 0:25, bg = "grey")
## points> ## End(Not run)
## points> 
## points> ##-------- Showing all the extra & some char graphics symbols ---------
## points> pchShow <-
## points+   function(extras = c("*",".", "o","O","0","+","-","|","%","#"),
## points+            cex = 3, ## good for both .Device=="postscript" and "x11"
## points+            col = "red3", bg = "gold", coltext = "brown", cextext = 1.2,
## points+            main = paste("plot symbols :  points (...  pch = *, cex =",
## points+                         cex,")"))
## points+   {
## points+     nex <- length(extras)
## points+     np  <- 26 + nex
## points+     ipch <- 0:(np-1)
## points+     k <- floor(sqrt(np))
## points+     dd <- c(-1,1)/2
## points+     rx <- dd + range(ix <- ipch %/% k)
## points+     ry <- dd + range(iy <- 3 + (k-1)- ipch %% k)
## points+     pch <- as.list(ipch) # list with integers & strings
## points+     if(nex > 0) pch[26+ 1:nex] <- as.list(extras)
## points+     plot(rx, ry, type = "n", axes  =  FALSE, xlab = "", ylab = "", main = main)
## points+     abline(v = ix, h = iy, col = "lightgray", lty = "dotted")
## points+     for(i in 1:np) {
## points+       pc <- pch[[i]]
## points+       ## 'col' symbols with a 'bg'-colored interior (where available) :
## points+       points(ix[i], iy[i], pch = pc, col = col, bg = bg, cex = cex)
## points+       if(cextext > 0)
## points+           text(ix[i] - 0.3, iy[i], pc, col = coltext, cex = cextext)
## points+     }
## points+   }
## 
## points> pchShow()

## 
## points> pchShow(c("o","O","0"), cex = 2.5)

## 
## points> pchShow(NULL, cex = 4, cextext = 0, main = NULL)

## 
## points> ## No test: 
## points> ##D ## ------------ test code for various pch specifications -------------
## points> ##D # Try this in various font families (including Hershey)
## points> ##D # and locales.  Use sign = -1 asserts we want Latin-1.
## points> ##D # Standard cases in a MBCS locale will not plot the top half.
## points> ##D TestChars <- function(sign = 1, font = 1, ...)
## points> ##D {
## points> ##D    MB <- l10n_info()$MBCS
## points> ##D    r <- if(font == 5) { sign <- 1; c(32:126, 160:254)
## points> ##D        } else if(MB) 32:126 else 32:255
## points> ##D    if (sign == -1) r <- c(32:126, 160:255)
## points> ##D    par(pty = "s")
## points> ##D    plot(c(-1,16), c(-1,16), type = "n", xlab = "", ylab = "",
## points> ##D         xaxs = "i", yaxs = "i",
## points> ##D         main = sprintf("sign = %d, font = %d", sign, font))
## points> ##D    grid(17, 17, lty = 1) ; mtext(paste("MBCS:", MB))
## points> ##D    for(i in r) try(points(i%%16, i%/%16, pch = sign*i, font = font,...))
## points> ##D }
## points> ##D TestChars()
## points> ##D try(TestChars(sign = -1))
## points> ##D TestChars(font = 5)  # Euro might be at 160 (0+10*16).
## points> ##D                      # macOS has apple at 240 (0+15*16).
## points> ##D try(TestChars(-1, font = 2))  # bold
## points> ## End(No test)
## points> 
## points>
plot(dem80$dwnom1, dem80$dwnom2, pch = 16, col = "blue",
     xlim = lim, ylim = lim, xlab = xlab, ylab = ylab,
     main = "第80議会") #民主党

points(rep80$dwnom1, rep80$dwnom2, pch = 17, col = "red") #共和党

text(-0.75, 1, "民主党")
text(1, -1, "共和党")

plot(dem112$dwnom1, dem112$dwnom2, pch = 16, col = "blue",
     xlim = lim, ylim = lim, xlab = xlab, ylab = ylab,
     main = "第112議会") #民主党

points(rep112$dwnom1, rep112$dwnom2, pch = 17, col = "red") #共和党

text(-0.75, 1, "民主党")
text(1, -1, "共和党")

tapply関数

dem.median <- tapply(dem$dwnom1, dem$congress, median)
dem.median
##      80      81      82      83      84      85      86      87      88      89 
## -0.1265 -0.2070 -0.1790 -0.1740 -0.2225 -0.2305 -0.2590 -0.2580 -0.2850 -0.2915 
##      90      91      92      93      94      95      96      97      98      99 
## -0.3040 -0.3140 -0.3175 -0.3290 -0.3140 -0.3080 -0.3125 -0.3100 -0.3045 -0.3140 
##     100     101     102     103     104     105     106     107     108     109 
## -0.3190 -0.3195 -0.3200 -0.3330 -0.3860 -0.3820 -0.3800 -0.3860 -0.3780 -0.3815 
##     110     111     112 
## -0.3670 -0.3455 -0.3980
rep.median <- tapply(rep$dwnom1, rep$congress, median)
rep.median
##     80     81     82     83     84     85     86     87     88     89     90 
## 0.2660 0.2620 0.2610 0.2575 0.2510 0.2450 0.2425 0.2310 0.2390 0.2310 0.2240 
##     91     92     93     94     95     96     97     98     99    100    101 
## 0.2170 0.2145 0.2160 0.2080 0.2100 0.2345 0.2550 0.2850 0.3030 0.3305 0.3370 
##    102    103    104    105    106    107    108    109    110    111    112 
## 0.3605 0.4080 0.4550 0.4820 0.4990 0.5265 0.5540 0.5825 0.6180 0.6550 0.6740
plot(names(dem.median), dem.median, col = "blue", type = "l", 
     xlim = c(80, 115), ylim = c(-1, 1), xlab = "議会会期", 
     ylab = "DW-NOMINATEスコア(第1次元)")

lines(names(rep.median), rep.median, col= "red")

text(110, -0.6, "民主党")
text(110, 0.85, "共和党")

3.6.2 相関

library(readr)
gini <- read_csv("USGini.csv")

plot(seq(from = 1947.5, to = 2011.5, by = 2),
     rep.median - dem.median,
     xlab = "年", ylab = "共和党中央値-民主党中央値",
     main = "政治的分極化")

plot(gini$year, gini$gini, ylim = c(0.35, 0.45),
     xlab = "年", ylab = "ジニ係数", 
     main = "所得の不平等")

cor(gini$gini[seq(from = 2, to = nrow(gini), by = 2)],
    rep.median - dem.median)
## [1] 0.9418128
hist(dem112$dwnom2, freq = FALSE, main = "民主党",
     xlim = c(-1.5, 1.5), ylim = c(0, 1.75),
     xlab = "人種的リベラル/保守の次元")

hist(rep112$dwnom2, freq = FALSE, main = "共和党",
     xlim = c(-1.5, 1.5), ylim = c(0, 1.75),
     xlab = "人種的リベラル/保守の次元")

qqplot(dem112$dwnom2, rep112$dwnom2, xlab = "民主党",
       ylab = "共和党", xlim = c(-1.5, 1.5),
       ylim = c(-1.5, 1.5),
       main = "人種的リベラル/保守の次元")
abline(0, 1)

# クラスター化

x <- matrix(1:12, nrow = 3, ncol = 4, byrow = TRUE)
  
rownames(x) <- c("a", "b", "c")
colnames(x) <- c("d", "e", "f", "g")
dim(x)
## [1] 3 4
x
##   d  e  f  g
## a 1  2  3  4
## b 5  6  7  8
## c 9 10 11 12
y <- data.frame(y1 = as.factor(c("a", "b", "c")),
               y2 = c(0.1, 0.2, 0.3))

class(y$y1)
## [1] "factor"
class(y$y2)
## [1] "numeric"
z <- as.matrix(y)

z
##      y1  y2   
## [1,] "a" "0.1"
## [2,] "b" "0.2"
## [3,] "c" "0.3"
colSums(x)
##  d  e  f  g 
## 15 18 21 24
rowMeans(x)
##    a    b    c 
##  2.5  6.5 10.5
apply(x, 2, sum)
##  d  e  f  g 
## 15 18 21 24
apply(x, 1, mean)
##    a    b    c 
##  2.5  6.5 10.5
apply(x, 1, sd)
##        a        b        c 
## 1.290994 1.290994 1.290994
x <- list(y1 = 1:10, y2 = c("hi", "hello", "hey"),
          y3 = data.frame(z1 = 1:3, z2 = c("good", "bad", "ugly")))

x$y1
##  [1]  1  2  3  4  5  6  7  8  9 10
x[[2]]
## [1] "hi"    "hello" "hey"
x[["y3"]]
##   z1   z2
## 1  1 good
## 2  2  bad
## 3  3 ugly
names(x)
## [1] "y1" "y2" "y3"
length(x)
## [1] 3
dwnom80 <- cbind(congress$dwnom1[congress$congress == 80], 
                 congress$dwnom2[congress$congress == 80])


dwnom112 <- cbind(congress$dwnom1[congress$congress == 112], 
                 congress$dwnom2[congress$congress == 112])
k80two.out <- kmeans(dwnom80, centers = 2, nstart =5)

k112two.out <- kmeans(dwnom112, centers = 2, nstart =5)
names(k80two.out)
## [1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss"
## [6] "betweenss"    "size"         "iter"         "ifault"
k80two.out$centers
##          [,1]       [,2]
## 1  0.14681029 -0.3389293
## 2 -0.04843704  0.7827259
k112two.out$centers
##         [,1]       [,2]
## 1  0.6776736 0.09061157
## 2 -0.3912687 0.03260696
table(party = congress$party[congress$congress == 80],
      cluster = k80two.out$cluster)
##             cluster
## party          1   2
##   Democrat    62 132
##   Other        2   0
##   Republican 247   3
table(party = congress$party[congress$congress == 112],
      cluster = k112two.out$cluster)
##             cluster
## party          1   2
##   Democrat     0 200
##   Republican 242   1
k80four.out <- kmeans(dwnom80, centers = 4, nstart = 5)
k112four.out <- kmeans(dwnom112, centers = 4, nstart = 5)
plot(dwnom80, col = k80four.out$cluster + 1, 
     xlab = xlab, ylab = ylab, xlim = lim, ylim = lim, main = "第80議会")
points(k80four.out$centers, pch = 8, cex = 2)

plot(dwnom112, col = k112four.out$cluster + 1, 
     xlab = xlab, ylab = ylab, xlim = lim, ylim = lim, main = "第112議会")
points(k112four.out$centers, pch = 8, cex = 2)

p <- palette()
p
## [1] "black"   "#DF536B" "#61D04F" "#2297E6" "#28E2E5" "#CD0BBC" "#F5C710"
## [8] "gray62"

参考文献

Blair, Graeme, Kosuke Imai, and Jason Lyall. 2014. “Comparing and Combining List and Endorsement Experiments: Evidence from Afghanistan.” American Journal of Political Science 58 (4): 1043–63.
Imai, Kosuke. 2018. Quantitative Social Science: An Introduction. Princeton University Press.
Llaudet, Elena, and Kosuke Imai. 2022. Data Analysis for Social Science: A Friendly and Practical Introduction. Princeton University Press.
Lyall, Jason, Graeme Blair, and Kosuke Imai. 2013. “Explaining Support for Combatants During Wartime: A Survey Experiment in Afghanistan.” American Political Science Review 107 (4): 679–705.