この章は、Lyall, Blair, and Imai (2013) と Blair, Imai, and Lyall (2014) の研究が元になっている。
library(readr)
afghan <- read_csv("afghan.csv")
summary(afghan$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.00 22.00 30.00 32.39 40.00 80.00
summary(afghan$educ.years)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 1.000 4.002 8.000 18.000
summary(afghan$employed)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 1.0000 0.5828 1.0000 1.0000
summary(afghan$income)
## Length Class Mode
## 2754 character character
barplot(table(afghan$income))
table(afghan$income)
##
## 10,001-20,000 2,001-10,000 20,001-30,000 less than 2,000 over 30,000
## 616 1420 93 457 14
prop.table(table(ISAF = afghan$violent.exp.ISAF, Taliban = afghan$violent.exp.taliban))
## Taliban
## ISAF 0 1
## 0 0.4953445 0.1318436
## 1 0.1769088 0.1959032
head(afghan$income, n = 10)
## [1] "2,001-10,000" "2,001-10,000" "2,001-10,000" "2,001-10,000"
## [5] "2,001-10,000" NA "10,001-20,000" "2,001-10,000"
## [9] "2,001-10,000" NA
head(is.na(afghan$income), n = 10)
## [1] FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE TRUE
sum(is.na(afghan$income))
## [1] 154
mean(is.na(afghan$income))
## [1] 0.05591866
x <- c(1, 2, 3, NA)
x
## [1] 1 2 3 NA
mean(x)
## [1] NA
mean(x, na.rm = TRUE)
## [1] 2
prop.table(table(ISAF = afghan$violent.exp.ISAF, Taliban = afghan$violent.exp.taliban, exclude = NULL))
## Taliban
## ISAF 0 1 <NA>
## 0 0.482933914 0.128540305 0.007988381
## 1 0.172476398 0.190994916 0.007988381
## <NA> 0.002541757 0.002904866 0.003631082
afghan.sub <- na.omit(afghan)
nrow(afghan.sub)
## [1] 2554
length(na.omit(afghan$income))
## [1] 2600
ISAF.ptable <- prop.table(table(ISAF = afghan$violent.exp.ISAF, exclude = NULL))
ISAF.ptable
## ISAF
## 0 1 <NA>
## 0.619462600 0.371459695 0.009077705
barplot(ISAF.ptable,
names.arg = c("被害なし", "被害あり", "無回答"),
main = "ISAFによる民間人被害",
xlab = "回答カテゴリー",
ylab = "回答者の割合", ylim = c(0, 0.7))
barplot
で棒グラフを描画できる。prop.table()
で比率を計算。table()
でクロス表を作成。exclude = NULL
により、ベースカテゴリも含める。Taliban.ptable <- prop.table(table(Taliban = afghan$violent.exp.taliban, exclude = NULL))
Taliban.ptable
## Taliban
## 0 1 <NA>
## 0.65795207 0.32244009 0.01960784
barplot(Taliban.ptable,
names.arg = c("被害なし", "被害あり", "無回答"),
main = "Talibanによる民間人被害",
xlab = "回答カテゴリー",
ylab = "回答者の割合", ylim = c(0, 0.7))
hist(afghan$age, freq = FALSE,
ylim = c(0, 0.04),
xlab = "年齢",
main = "回答者の年齢分布")
-
lines(x, y)
で、座標(x, y)を通る線を引ける。
hist(afghan$educ.years, freq = FALSE,
breaks = seq(from = -0.5, to = 18.5, by = 1),
xlab = "教育を受けた年数",
main = "回答者の教育程度の分布")
text(x = 2.0, y = 0.5, "中央値")
abline(v = median(afghan$educ.years))
lines(x = rep(mean(afghan$educ.years), 2), y = c(0, 0.5))
text(x = 5.0, y = 0.5, "平均値")
- 箱ひげ図
boxplot(afghan$age, main = "年齢の分布",
ylab = "年齢",
ylim = c(10, 80))
boxplot(afghan$educ.years ~ afghan$province,
main = "州別の教育程度",
ylab = "教育を受けた年数")
-
tapply(x, group, mean)
で、グループ別の平均値を計算。 -
na.rm = TRUE
により、欠損値を除く。
tapply(afghan$violent.exp.taliban,
afghan$province, mean, na.rm = TRUE)
## Helmand Khost Kunar Logar Uruzgan
## 0.50422195 0.23322684 0.30303030 0.08024691 0.45454545
tapply(afghan$violent.exp.ISAF,
afghan$province, mean, na.rm = TRUE)
## Helmand Khost Kunar Logar Uruzgan
## 0.5410226 0.2424242 0.3989899 0.1440329 0.4960422
png(file = "educ.png")
とdev.off()
でグラフを保存。png(file = "educ.png")
boxplot(afghan$educ.years ~ afghan$province,
main = "州別の教育程度",
ylab = "教育を受けた年数")
dev.off()
## png
## 2
par(mfrow = c(1, 2)
により2つのグラフを1行2列の配置にする。png(file = "hist.png")
# 2つのグラフを1行2列の配置
par(mfrow = c(1, 2), cex = 0.8)
# グラフ1
hist(afghan$age, freq = FALSE,
xlab = "年齢", ylim = c(0, 0.04),
main = "回答者の年齢分布")
# グラフ2
hist(afghan$educ.years, freq = FALSE,
breaks = seq(from = -0.5, to = 18.5, by = 1),
xlab = "教育を受けた年数",
xlim = c(0, 20),
main = "回答者の教育程度分布")
dev.off()
## png
## 2
library(readr)
afghan_village <- read_csv("afghan-village.csv")
boxplot(afghan_village$altitude ~ afghan_village$village.surveyed,
ylab = "標高(メートル)", names = c("非抽出", "抽出"))
boxplot(log(afghan_village$population) ~ afghan_village$village.surveyed,
ylab = "対数人口", names = c("非抽出", "抽出"))
tapply(is.na(afghan$violent.exp.taliban), afghan$province, mean)
## Helmand Khost Kunar Logar Uruzgan
## 0.030409357 0.006349206 0.000000000 0.000000000 0.062015504
tapply(is.na(afghan$violent.exp.ISAF), afghan$province, mean)
## Helmand Khost Kunar Logar Uruzgan
## 0.016374269 0.004761905 0.000000000 0.000000000 0.020671835
mean(afghan$list.response[afghan$list.group == "ISAF"]) -
mean(afghan$list.response[afghan$list.group == "control"])
## [1] 0.04901961
table(response = afghan$list.response,
group = afghan$list.group)
## group
## response control ISAF taliban
## 0 188 174 0
## 1 265 278 433
## 2 265 260 287
## 3 200 182 198
## 4 0 24 0
library(readr)
congress <- read_csv("congress.csv")
rep <- subset(congress, subset = (party == "Republican"))
dem <- subset(congress, subset = (party == "Democrat"))
rep80 <- subset(rep, subset = (congress == 80))
dem80 <- subset(dem, subset = (congress == 80))
rep112 <- subset(rep, subset = (congress == 112))
dem112 <- subset(dem, subset = (congress == 112))
xlab <- "経済的リベラル/保守"
ylab <- "人種的リベラル/保守"
lim <- c(-1.5, 1.5)
example(points)
##
## points> require(stats) # for rnorm
##
## points> plot(-4:4, -4:4, type = "n") # setting up coord. system
##
## points> points(rnorm(200), rnorm(200), col = "red")
##
## points> points(rnorm(100)/2, rnorm(100)/2, col = "blue", cex = 1.5)
##
## points> op <- par(bg = "light blue")
##
## points> x <- seq(0, 2*pi, length.out = 51)
##
## points> ## something "between type='b' and type='o'":
## points> plot(x, sin(x), type = "o", pch = 21, bg = par("bg"), col = "blue", cex = .6,
## points+ main = 'plot(..., type="o", pch=21, bg=par("bg"))')
##
## points> par(op)
##
## points> ## Not run:
## points> ##D ## The figure was produced by calls like
## points> ##D png("pch.png", height = 0.7, width = 7, res = 100, units = "in")
## points> ##D par(mar = rep(0,4))
## points> ##D plot(c(-1, 26), 0:1, type = "n", axes = FALSE)
## points> ##D text(0:25, 0.6, 0:25, cex = 0.5)
## points> ##D points(0:25, rep(0.3, 26), pch = 0:25, bg = "grey")
## points> ## End(Not run)
## points>
## points> ##-------- Showing all the extra & some char graphics symbols ---------
## points> pchShow <-
## points+ function(extras = c("*",".", "o","O","0","+","-","|","%","#"),
## points+ cex = 3, ## good for both .Device=="postscript" and "x11"
## points+ col = "red3", bg = "gold", coltext = "brown", cextext = 1.2,
## points+ main = paste("plot symbols : points (... pch = *, cex =",
## points+ cex,")"))
## points+ {
## points+ nex <- length(extras)
## points+ np <- 26 + nex
## points+ ipch <- 0:(np-1)
## points+ k <- floor(sqrt(np))
## points+ dd <- c(-1,1)/2
## points+ rx <- dd + range(ix <- ipch %/% k)
## points+ ry <- dd + range(iy <- 3 + (k-1)- ipch %% k)
## points+ pch <- as.list(ipch) # list with integers & strings
## points+ if(nex > 0) pch[26+ 1:nex] <- as.list(extras)
## points+ plot(rx, ry, type = "n", axes = FALSE, xlab = "", ylab = "", main = main)
## points+ abline(v = ix, h = iy, col = "lightgray", lty = "dotted")
## points+ for(i in 1:np) {
## points+ pc <- pch[[i]]
## points+ ## 'col' symbols with a 'bg'-colored interior (where available) :
## points+ points(ix[i], iy[i], pch = pc, col = col, bg = bg, cex = cex)
## points+ if(cextext > 0)
## points+ text(ix[i] - 0.3, iy[i], pc, col = coltext, cex = cextext)
## points+ }
## points+ }
##
## points> pchShow()
##
## points> pchShow(c("o","O","0"), cex = 2.5)
##
## points> pchShow(NULL, cex = 4, cextext = 0, main = NULL)
##
## points> ## No test:
## points> ##D ## ------------ test code for various pch specifications -------------
## points> ##D # Try this in various font families (including Hershey)
## points> ##D # and locales. Use sign = -1 asserts we want Latin-1.
## points> ##D # Standard cases in a MBCS locale will not plot the top half.
## points> ##D TestChars <- function(sign = 1, font = 1, ...)
## points> ##D {
## points> ##D MB <- l10n_info()$MBCS
## points> ##D r <- if(font == 5) { sign <- 1; c(32:126, 160:254)
## points> ##D } else if(MB) 32:126 else 32:255
## points> ##D if (sign == -1) r <- c(32:126, 160:255)
## points> ##D par(pty = "s")
## points> ##D plot(c(-1,16), c(-1,16), type = "n", xlab = "", ylab = "",
## points> ##D xaxs = "i", yaxs = "i",
## points> ##D main = sprintf("sign = %d, font = %d", sign, font))
## points> ##D grid(17, 17, lty = 1) ; mtext(paste("MBCS:", MB))
## points> ##D for(i in r) try(points(i%%16, i%/%16, pch = sign*i, font = font,...))
## points> ##D }
## points> ##D TestChars()
## points> ##D try(TestChars(sign = -1))
## points> ##D TestChars(font = 5) # Euro might be at 160 (0+10*16).
## points> ##D # macOS has apple at 240 (0+15*16).
## points> ##D try(TestChars(-1, font = 2)) # bold
## points> ## End(No test)
## points>
## points>
plot(dem80$dwnom1, dem80$dwnom2, pch = 16, col = "blue",
xlim = lim, ylim = lim, xlab = xlab, ylab = ylab,
main = "第80議会") #民主党
points(rep80$dwnom1, rep80$dwnom2, pch = 17, col = "red") #共和党
text(-0.75, 1, "民主党")
text(1, -1, "共和党")
plot(dem112$dwnom1, dem112$dwnom2, pch = 16, col = "blue",
xlim = lim, ylim = lim, xlab = xlab, ylab = ylab,
main = "第112議会") #民主党
points(rep112$dwnom1, rep112$dwnom2, pch = 17, col = "red") #共和党
text(-0.75, 1, "民主党")
text(1, -1, "共和党")
tapply
関数
dem.median <- tapply(dem$dwnom1, dem$congress, median)
dem.median
## 80 81 82 83 84 85 86 87 88 89
## -0.1265 -0.2070 -0.1790 -0.1740 -0.2225 -0.2305 -0.2590 -0.2580 -0.2850 -0.2915
## 90 91 92 93 94 95 96 97 98 99
## -0.3040 -0.3140 -0.3175 -0.3290 -0.3140 -0.3080 -0.3125 -0.3100 -0.3045 -0.3140
## 100 101 102 103 104 105 106 107 108 109
## -0.3190 -0.3195 -0.3200 -0.3330 -0.3860 -0.3820 -0.3800 -0.3860 -0.3780 -0.3815
## 110 111 112
## -0.3670 -0.3455 -0.3980
rep.median <- tapply(rep$dwnom1, rep$congress, median)
rep.median
## 80 81 82 83 84 85 86 87 88 89 90
## 0.2660 0.2620 0.2610 0.2575 0.2510 0.2450 0.2425 0.2310 0.2390 0.2310 0.2240
## 91 92 93 94 95 96 97 98 99 100 101
## 0.2170 0.2145 0.2160 0.2080 0.2100 0.2345 0.2550 0.2850 0.3030 0.3305 0.3370
## 102 103 104 105 106 107 108 109 110 111 112
## 0.3605 0.4080 0.4550 0.4820 0.4990 0.5265 0.5540 0.5825 0.6180 0.6550 0.6740
plot(names(dem.median), dem.median, col = "blue", type = "l",
xlim = c(80, 115), ylim = c(-1, 1), xlab = "議会会期",
ylab = "DW-NOMINATEスコア(第1次元)")
lines(names(rep.median), rep.median, col= "red")
text(110, -0.6, "民主党")
text(110, 0.85, "共和党")
library(readr)
gini <- read_csv("USGini.csv")
plot(seq(from = 1947.5, to = 2011.5, by = 2),
rep.median - dem.median,
xlab = "年", ylab = "共和党中央値-民主党中央値",
main = "政治的分極化")
plot(gini$year, gini$gini, ylim = c(0.35, 0.45),
xlab = "年", ylab = "ジニ係数",
main = "所得の不平等")
cor(gini$gini[seq(from = 2, to = nrow(gini), by = 2)],
rep.median - dem.median)
## [1] 0.9418128
hist(dem112$dwnom2, freq = FALSE, main = "民主党",
xlim = c(-1.5, 1.5), ylim = c(0, 1.75),
xlab = "人種的リベラル/保守の次元")
hist(rep112$dwnom2, freq = FALSE, main = "共和党",
xlim = c(-1.5, 1.5), ylim = c(0, 1.75),
xlab = "人種的リベラル/保守の次元")
qqplot(dem112$dwnom2, rep112$dwnom2, xlab = "民主党",
ylab = "共和党", xlim = c(-1.5, 1.5),
ylim = c(-1.5, 1.5),
main = "人種的リベラル/保守の次元")
abline(0, 1)
# クラスター化
x <- matrix(1:12, nrow = 3, ncol = 4, byrow = TRUE)
rownames(x) <- c("a", "b", "c")
colnames(x) <- c("d", "e", "f", "g")
dim(x)
## [1] 3 4
x
## d e f g
## a 1 2 3 4
## b 5 6 7 8
## c 9 10 11 12
y <- data.frame(y1 = as.factor(c("a", "b", "c")),
y2 = c(0.1, 0.2, 0.3))
class(y$y1)
## [1] "factor"
class(y$y2)
## [1] "numeric"
z <- as.matrix(y)
z
## y1 y2
## [1,] "a" "0.1"
## [2,] "b" "0.2"
## [3,] "c" "0.3"
colSums(x)
## d e f g
## 15 18 21 24
rowMeans(x)
## a b c
## 2.5 6.5 10.5
apply(x, 2, sum)
## d e f g
## 15 18 21 24
apply(x, 1, mean)
## a b c
## 2.5 6.5 10.5
apply(x, 1, sd)
## a b c
## 1.290994 1.290994 1.290994
x <- list(y1 = 1:10, y2 = c("hi", "hello", "hey"),
y3 = data.frame(z1 = 1:3, z2 = c("good", "bad", "ugly")))
x$y1
## [1] 1 2 3 4 5 6 7 8 9 10
x[[2]]
## [1] "hi" "hello" "hey"
x[["y3"]]
## z1 z2
## 1 1 good
## 2 2 bad
## 3 3 ugly
names(x)
## [1] "y1" "y2" "y3"
length(x)
## [1] 3
dwnom80 <- cbind(congress$dwnom1[congress$congress == 80],
congress$dwnom2[congress$congress == 80])
dwnom112 <- cbind(congress$dwnom1[congress$congress == 112],
congress$dwnom2[congress$congress == 112])
k80two.out <- kmeans(dwnom80, centers = 2, nstart =5)
k112two.out <- kmeans(dwnom112, centers = 2, nstart =5)
names(k80two.out)
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
k80two.out$centers
## [,1] [,2]
## 1 0.14681029 -0.3389293
## 2 -0.04843704 0.7827259
k112two.out$centers
## [,1] [,2]
## 1 0.6776736 0.09061157
## 2 -0.3912687 0.03260696
table(party = congress$party[congress$congress == 80],
cluster = k80two.out$cluster)
## cluster
## party 1 2
## Democrat 62 132
## Other 2 0
## Republican 247 3
table(party = congress$party[congress$congress == 112],
cluster = k112two.out$cluster)
## cluster
## party 1 2
## Democrat 0 200
## Republican 242 1
k80four.out <- kmeans(dwnom80, centers = 4, nstart = 5)
k112four.out <- kmeans(dwnom112, centers = 4, nstart = 5)
plot(dwnom80, col = k80four.out$cluster + 1,
xlab = xlab, ylab = ylab, xlim = lim, ylim = lim, main = "第80議会")
points(k80four.out$centers, pch = 8, cex = 2)
plot(dwnom112, col = k112four.out$cluster + 1,
xlab = xlab, ylab = ylab, xlim = lim, ylim = lim, main = "第112議会")
points(k112four.out$centers, pch = 8, cex = 2)
p <- palette()
p
## [1] "black" "#DF536B" "#61D04F" "#2297E6" "#28E2E5" "#CD0BBC" "#F5C710"
## [8] "gray62"