TokyoR#42 テキスト (Rで学ぶデータマイニングⅠ 第１８章)

library(ggplot2)
library(scales)

交通事故死者数データ

データ作成

death <- data.frame(
  year=7:17,
  shisha=c(10679,9942,9640,9211,9006,9066,8747,8326,7702,7358,6871))

事故発生後24時間いないの交通事故者数1

ggplot(subset(death, year>=13), aes(x=year, y=shisha-6000)) +
  geom_bar(stat="identity", col="black", fill="grey") +
  scale_y_continuous(
    breaks = seq(0, 4000, by = 1000),
    labels = comma(seq(6000,10000, by=1000))
  ) +
  xlab("") +
  ylab("人数") +
  ggtitle("事故発生後24時間いないの交通事故者数") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-3

事故発生後24時間いないの交通事故者数2

ggplot(subset(death, year>=13), aes(x=year, y=shisha)) +
  geom_bar(stat="identity", col="black", fill="grey") +
  scale_y_continuous(label=comma) +
  xlab("") +
  ylab("人数") +
  ggtitle("事故発生後24時間いないの交通事故者数") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-4

事故発生後24時間いないの交通事故者数3

ggplot(death, aes(x=year, y=shisha)) +
  geom_bar(col="black", fill="grey", stat="identity") +
  scale_y_continuous(label=comma) +
  xlab("") +
  ylab("人数") +
  ggtitle("事故発生後24時間いないの交通事故者数") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-5

事故発生後24時間いないの交通事故者数4

ggplot(death, aes(x=year, y=shisha)) +
  geom_line() +
  geom_point(shape=1, size=2) +
  scale_y_continuous(label=comma) +
  xlab("") +
  ylab("人数") +
  ggtitle("事故発生後24時間いないの交通事故者数") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-6

事故発生後24時間いないの交通事故者数5

ggplot(death, aes(x=year, y=shisha)) +
  geom_bar(col="black", fill="grey", stat="identity") +
  scale_y_continuous(label=comma) +
  xlab("") +
  ylab("") +
  ggtitle("事故発生後24時間いないの交通事故者数") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-7

飲酒有無別交通事故死亡者数推移

library(reshape2)

death2 <- read.csv("death2.csv", fileEncoding="shift_jis")
death2_melt <- melt(death2, id.vars = "year")

ggplot(death2_melt, aes(x=as.factor(year), y=value, col=variable, shape=variable, group=variable)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(label=comma, limits=c(0,NA)) +
  xlab("") +
  ylab("人数") +
  ggtitle("飲酒有無別交通事故死亡者数") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-8

ggplot(subset(death2_melt,variable!="交通事故死亡者数"), aes(x=as.factor(year), y=value, fill=variable, group=variable)) +
  geom_bar(stat="identity",position="fill") +
  scale_y_continuous(label=percent) +
  xlab("") +
  ylab("割合") +
  ggtitle("飲酒有無別交通事故死亡者数比率") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-8

飲酒有交通事故死亡者数推移

ggplot(subset(death2_melt, variable=="飲酒あり.第１当事者."), aes(x=as.factor(year), y=value, col=variable, shape=variable, group=variable)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(label=comma, limits=c(0,NA)) +
  xlab("") +
  ylab("人数") +
  ggtitle("飲酒有交通事故死亡者数") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-9

改正による交通事故死亡者数減少効果

death2_2 <- death2[,c(1,3,4)]
death2_2$baseline <- mean(death2_2[1:7,3] / death2_2[1:7,2]) * death2_2[,2]
death2_melt2 <- melt(death2_2[,c(1,3,4)], id.vars = "year")
death2_melt2$variable <- as.character(death2_melt2$variable)
death2_melt2$variable <- ifelse(death2_melt2$variable=="baseline", "改正なし想定", death2_melt2$variable)

ggplot(death2_melt2, aes(x=as.factor(year), y=value, col=variable, shape=variable, group=variable)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(label=comma, limits=c(0,NA)) +
  xlab("") +
  ylab("人数") +
  ggtitle("改正による交通事故死亡者数減少効果") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-10

death2_melt3 <- rbind(death2_melt2, subset(death2_melt, variable=="飲酒なし.第１当事者."))


ggplot(death2_melt3, aes(x=as.factor(year), y=value, col=variable, shape=variable, group=variable)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(label=comma, limits=c(0,NA)) +
  xlab("") +
  ylab("人数") +
  ggtitle("改正による交通事故死亡者数減少効果") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-10

### 飲酒有無別交通事故数 推移

jiko <- read.csv("inshu_jiko.csv", fileEncoding="shift_jis")
jiko_melt <- melt(jiko, id.vars = "year")

ggplot(jiko_melt, aes(x=as.factor(year), y=value, col=variable, shape=variable, group=variable)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(label=comma, limits=c(0,NA)) +
  xlab("") +
  ylab("事故数") +
  ggtitle("飲酒有無別交通事故数") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-11

飲酒有交通事故数推移

ggplot(subset(jiko_melt, variable=="飲酒あり"), aes(x=as.factor(year), y=value, col=variable, shape=variable, group=variable)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(label=comma, limits=c(0,NA)) +
  xlab("") +
  ylab("人数") +
  ggtitle("飲酒有交通事故数") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-12

飲酒有無別死亡事故率推移

death_rate <- read.csv("death_rate.csv", fileEncoding="shift_jis")
death_rate_melt <- melt(death_rate, id.vars = "year")

ggplot(death_rate_melt, aes(x=as.factor(year), y=value, col=variable, shape=variable, group=variable)) +
  geom_line() +
  geom_point() +
  scale_y_continuous(label=comma, limits=c(0,NA)) +
  xlab("") +
  ylab("死亡事故率") +
  ggtitle("飲酒有無別死亡事故率（%）") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-13

death.rate <- data.frame(
  year=8:17,
  rate=(death$shisha[-1] - death$shisha[-nrow(death)]) / death$shisha[-1]
  )

ggplot(death.rate, aes(x=year, y=rate)) +
  geom_line() +
  geom_point(shape=1, size=2) +
  geom_abline(intercept=0, slope=0, lty=2) +
  ggtitle("事故発生後24時間いないの交通事故者数の増減率") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-15

death.rate$rate.scaled <- scale(death.rate$rate)

ggplot(death.rate, aes(x=year, y=rate.scaled)) +
  geom_line() +
  geom_point(shape=1, size=2) +
  geom_abline(intercept=0, slope=0, lty=2) +
  geom_abline(intercept=1.96, slope=0, lty=2) +
  geom_abline(intercept=-1.96, slope=0, lty=2) +
  ylim(-2.2, 2.2) +
  ggtitle("基準化された増減率") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-17

death.rate$gmean <- 
  exp(sum(log(1+death.rate$rate))/length(death.rate$rate))

death.rate$rate.gscaled <- 
  (death.rate$rate+1 - death.rate$gmean) / sqrt(sum((death.rate$rate+1 - death.rate$gmean)^2)/(nrow(death.rate)-1))

ggplot(death.rate, aes(x=year, y=rate.gscaled)) +
  geom_line() +
  geom_point(shape=1, size=2) +
  geom_abline(intercept=0, slope=0, lty=2) +
  geom_abline(intercept=1.96, slope=0, lty=2) +
  geom_abline(intercept=-1.96, slope=0, lty=2) +
  ylim(-2.2, 2.2) +
  ggtitle("基準化された増減率") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-19

漢字問題データ

大学生は漢字が苦手

グループ	検定の合格率
中学1年生	78.5%
高校1年生	59.0%
大学1年生	39.8%
上場企業新入社員	60.7%

kanji <- data.frame(
  group=factor(c("中学1","高校1","大学1","新入社員"),
               levels=c("中学1","高校1","大学1","新入社員")),
  rate=c(78.5, 59.0, 39.8, 60.7))

ggplot(kanji, aes(x=group, y=rate)) +
  geom_bar(col="black",fill="grey",stat="identity") +
  ylim(0,100) +
  xlab("グループ") +
  ylab("検定の合格率") +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-21

インフルエンザワクチンの有効率データ

インフルエンザワクチン：昨冬の効果有効率27%

flu <- data.frame(
  year=c(13, 14, 15, 16),
  vac=c(0.3, 1.9, 1.7, 4.8),
  n.vac=c(1.2, 5.9, 2.4, 6.6))

flu <- transform(flu, eff=round( (n.vac-vac)/n.vac*100, 1))
flu

##   year vac n.vac  eff
## 1   13 0.3   1.2 75.0
## 2   14 1.9   5.9 67.8
## 3   15 1.7   2.4 29.2
## 4   16 4.8   6.6 27.3

ggplot(flu, aes(x=year, y=eff)) +
  geom_line() +
  geom_point() +
  xlab("有効率") +
  ylab("") +
  ylim(0,80) +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-23

ggplot(flu, aes(x=year, y=eff)) +
  geom_bar(col="black",fill="grey",stat="identity") +
  xlab("有効率") +
  ylab("") +
  ylim(0,80) +
  theme_bw(base_family="Osaka")

plot of chunk unnamed-chunk-24

flu2 <- data.frame(
  vac=c(14364-696, 696, 14364),
  n.vac=c(3101-206, 206, 3101))
row.names(flu2) <- c("fine","sick","sum")
flu2

##        vac n.vac
## fine 13668  2895
## sick   696   206
## sum  14364  3101

flu2.ratio <- flu2["sick",] / flu2["sum",] * 100
row.names(flu2.ratio) <- c("ratio")
flu2.ratio

##         vac n.vac
## ratio 4.845 6.643

prop.test(as.numeric(flu2["sick",]), 
          as.numeric(flu2["sum",]))

## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  as.numeric(flu2["sick", ]) out of as.numeric(flu2["sum", ])
## X-squared = 16.46, df = 1, p-value = 4.967e-05
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.027614 -0.008337
## sample estimates:
##  prop 1  prop 2 
## 0.04845 0.06643

prop.test(as.numeric(flu2["sick",]), 
          as.numeric(flu2["sum",]),
          p=c(0.05, 0.05))

## 
##  2-sample test for given proportions with continuity correction
## 
## data:  as.numeric(flu2["sick", ]) out of as.numeric(flu2["sum", ]), null probabilities c(0.05, 0.05)
## X-squared = 17.97, df = 2, p-value = 0.0001253
## alternative hypothesis: two.sided
## null values:
## prop 1 prop 2 
##   0.05   0.05 
## sample estimates:
##  prop 1  prop 2 
## 0.04845 0.06643