Ｒで学ぶ確立統計

options(encoding = "utf-8")
b01 <- read.csv("A02L01.csv")
b02 <- table(b01$study,b01$result)
b02

##                 
##                  合格 不合格
##   勉強した        100     50
##   勉強しなかった   25     25

第１回　Ｒの基本操作

##　2-3 データフレーム

h <- c(148,152,154,158)
w <- c(52,54,56,58)
D <- data.frame(h,w)
D

##     h  w
## 1 148 52
## 2 152 54
## 3 154 56
## 4 158 58

names(D)

## [1] "h" "w"

## 3-1 パッケージの利用(tidyverse)

# install.packages("tidyverse")
# install.packages("knitr")
 library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

## 3-2 tibble (データフレームの改良版：tidyverse)

h <- c(148,152,154,158)
w <- c(52,54,56,58)
d <- tibble(h,w)
d

## # A tibble: 4 × 2
##       h     w
##   <dbl> <dbl>
## 1   148    52
## 2   152    54
## 3   154    56
## 4   158    58

## 3-4 ファイルの読み込み(tidyverse)(1)

library(tidyverse)
a01 <- read_table2("A01L01.txt")

## Warning: `read_table2()` was deprecated in readr 2.0.0.
## ℹ Please use `read_table()` instead.

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   name1 = col_character(),
##   A = col_double(),
##   B = col_double()
## )

a02 <- read_csv("A01L02.csv")

## Rows: 5 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): name1
## dbl (2): C, D
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

a03 <- read_tsv("A01L03.tsv")

## Rows: 5 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): name1
## dbl (2): height, weight
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

## 3-4 ファイルの読み込み(tidyverse)(2)

# ?read.csv
# ?read_csv

第２回　データの要約とレポート作成

## 1-5 表の作成

b01 <- read_csv("A02L01.csv")

## Rows: 200 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): study, result, gender
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

b02 <- table(b01$study,b01$result)
# b02 <- table(b01[,1:2])
prop.table(b02)

##                 
##                   合格 不合格
##   勉強した       0.500  0.250
##   勉強しなかった 0.125  0.125

prop.table(b02,1)

##                 
##                       合格    不合格
##   勉強した       0.6666667 0.3333333
##   勉強しなかった 0.5000000 0.5000000

prop.table(b02,2)

##                 
##                       合格    不合格
##   勉強した       0.8000000 0.6666667
##   勉強しなかった 0.2000000 0.3333333

addmargins(b02)

##                 
##                  合格 不合格 Sum
##   勉強した        100     50 150
##   勉強しなかった   25     25  50
##   Sum             125     75 200

addmargins(b02,1)

##                 
##                  合格 不合格
##   勉強した        100     50
##   勉強しなかった   25     25
##   Sum             125     75

addmargins(b02,2)

##                 
##                  合格 不合格 Sum
##   勉強した        100     50 150
##   勉強しなかった   25     25  50

table(b01[,1:2]) %>% prop.table() %>% addmargins()

##                 result
## study             合格 不合格   Sum
##   勉強した       0.500  0.250 0.750
##   勉強しなかった 0.125  0.125 0.250
##   Sum            0.625  0.375 1.000

## 2-1 グラフの作成(2)

library(ggplot2)

# ggplot() +
#   geom_point()
#   geom_line()
#   geom_bar()
#   geom_histogram()
#   geom_box()
#   stat_smooth()
#   theme()


## 2-2 散布図、折れ線グラフ(1)

a01 <- read_csv("A02L02.csv")

## Rows: 5 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): name1
## dbl (2): A, B
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

ggplot(a01,aes(x=A,y=B)) +
    geom_point()

a01

## # A tibble: 5 × 3
##   name1     A     B
##   <chr> <dbl> <dbl>
## 1 A1      148    52
## 2 A2      152    54
## 3 A3      154    56
## 4 A4      158    58
## 5 A5      163    60

## 2-2 散布図、折れ線グラフ(2)

ggplot(a01,aes(x=A,y=B)) +
 geom_line() +
 labs(x="hight",y="weight") +
 theme(text=element_text(size=16))

## 2-2 散布図、折れ線グラフ(3)

ggplot(a01,aes(x=A,y=B)) +
 geom_point() + geom_line() +
 theme(text=element_text(size=16))

## 2-3 棒グラフ、積み上げ棒グラフ、円グラフ(1)

c01 <- read_csv("A02L03.csv")

## Rows: 7 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): A
## dbl (2): freq, pos1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

 ggplot(c01,aes(x=A,y=freq)) +
 geom_bar(stat="identity")

 ggplot(b01,aes(x=study)) +
 geom_bar()

## 2-3 棒グラフ、積み上げ棒グラフ、円グラフ(2)

p1 <- ggplot(c01,aes(x="",y=freq,fill=A)) +
 geom_bar(stat="identity",position="fill")
 geom_text(aes(label=freq,y=pos1))

## mapping: y = ~pos1, label = ~freq 
## geom_text: parse = FALSE, check_overlap = FALSE, na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity

p1

c01[,3] <- 1-(cumsum(c01$freq)-(0.5*c01$freq))


## 2-3 棒グラフ、積み上げ棒グラフ、円グラフ(3)

p2 <- p1 +
 coord_polar("y",direction=-1) +
 theme_void()
p2

## 2-4 ヒストグラム、密度グラフ

c02 <- read_csv("A02L04.csv")

## Rows: 100 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): subjA, subjB
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

 ggplot(c02,aes(x=subjA,y=..density..)) +
 geom_histogram() +
 geom_density()

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

第３回　記述統計と確立

## 1-4 集団の特徴を表現する量(3)

library(ggplot2)
d01 <- read.csv("A03L01.csv")
ggplot(d01) +
 geom_boxplot(aes(x=subject,y=score))

## 1-4 集団の特徴を表現する量(6)

d02 <- read.csv("A03L02.csv")
var(d02)

##          subjA     subjB
## subjA 89.42424  18.18788
## subjB 18.18788 138.19960

cor(d02)

##           subjA     subjB
## subjA 1.0000000 0.1636067
## subjB 0.1636067 1.0000000

第４回　確率分布①

## 3-3 Ｒによるシミュリーション
## 離散型
library(tidyverse)

f1 <- tibble(x=0:10,y=dgeom(seq(0,10,1),0.2))
ggplot(f1) + geom_bar(aes(x=x,y=y),stat="identity")

## 連続型
ggplot(data=data.frame(x=c(-1,1,5)),aes(x=x)) + 
 stat_function(fun=dexp,args=c(rate=1))

ggplot(data.frame(x=rnorm(1000))) + 
 geom_histogram(aes(x=x,y=..density..)) +
 stat_function(fun=dnorm,color="red")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

第５回　確率分布②

## 3-1 Ｒにおける分布関数と密度関数の描画

  ggplot(data=data.frame(x=c(-1,10)),aes(x=x)) + 
 stat_function(fun=df,args=list(df1=30,df2=20))

## 3-2 大数の弱法則

n <- 50000
p <- 0.3
g1 <- tibble(count=1:n,trial=rbinom(n,1,p),
   success=cumsum(trial),prob=success/count)
ggplot(g1,aes(x=count,y=prob))+ylim(0,1)+
  geom_hline(yintercept=p, colour="red")+geom_line()

## ③一3中心極限定理

n <- 1000
m <- 10000

a <- -10;b <- 10
g2 <- numeric(m)
for(i in 1:m){
 g2[i] <- (mean(runif(n,a,b))-(a+b)/2)/(sqrt((b-a)^2/12)/sqrt(n))
}
g3 <- tibble(x=g2)
ggplot(g3)+geom_histogram(aes(x=x,y=..density..),bins=50)

##  ③一χ2分布
n <- 3
m <- 1000
#
g4 <- numeric(m)
for(i in 1:m){
g4[i] <- (sum(rnorm(n,0,1)^2))
}
g5 <- tibble(x=g4)
ggplot(g5)+geom_histogram(aes(x=x,y=..density..),bins=50)+
stat_function(fun=dchisq,args=c(df=n))

##  ③一5  t分布
a <- 10;b <- 5;
n <- 20;m <- 1000
g6 <- numeric(n)
g7 <- numeric(m)
for(i in 1:m){
 g6 <- rnorm(n,a,b);barx <- mean(g6)
 g7[i] <- sum((g6-a)/(b*sqrt(n)))/sqrt(sum((g6-barx)^2/b^2)/(n-1))
}
g8 <- tibble(x=g7)
ggplot(g8)+geom_histogram(aes(x=x,y=..density..),bins=50)+
stat_function(fun=dt,args=c(df=(n-1)))

##  ③一6  F分布
m <- 1000
n1 <- 60;n2 <- 70;
a1 <- 10;b1<-10;a2 <- 30;b2 <- 40
ge <- numeric(m)
for (i in 1:m){
ga <- rnorm(n1,a1,b1);gb <- rnorm(n2,a2,b2)
gc <- sum((ga-mean(ga))^2/(b1^2));gd <- sum((gb-mean(gb))^2/(b2^2))
ge[i] <- (gc/(n1-1))/(gd/(n2-1))
}
gf <- tibble(x=ge)
ggplot(gf)+geom_histogram(aes(x=x,y=..density..),bins=50)+
stat_function(fun=df,args=c(df1=(n1-1),df2=(n2-1)))

第６回　統計的推定

##  ③-1  区間推定

prob <- 0.3
n <- 1001
m <- 100
alpha <- 0.05
est <- numeric(m)
for(j in 1:m){
 est[j] <- sum(rbinom(n,1,prob)/n)
}
z <- tibble(count=1:m,est=est,
 lower=est - qnorm(1-alpha/2,0,1)*sqrt(est*(1-est)/n),
 upper=est + qnorm(1-alpha/2,0,1)*sqrt(est*(1-est)/n),
check=ifelse(abs(prob-est) < qnorm(1-alpha/2,0,1)*
  sqrt(est*(1-est)/n),"Y","N"))

##  グラフ
ggplot(z)+geom_point(aes(x=count,y=est))+
 geom_segment(aes(x=count,y=lower,xend=count,yend=upper,colour=check))+
  geom_hline(yintercept=prob,colour="red")

第７回　統計的仮説検定①

## ①－４　片側検定と両側検定(2)
## 二項分布を正規分布で近似

prob <- 0.5;n <- 500;u0 <- n*prob;s0 <- sqrt(n*prob*(1-prob))
qnorm(0.025,mean=u0,sd=s0)  #228.0869

## [1] 228.0869

qnorm(0.05,mean=u0,sd=s0)  #231.61

## [1] 231.61

qnorm(0.95,mean=u0,sd=s0)  #268.39

## [1] 268.39

qnorm(0.975,mean=u0,sd=s0)  #271.9131

## [1] 271.9131

## ①－４　片側検定と両側検定(3)

i3 <- tibble(x=200:300,y=dbinom(200:300,n,prob),z=c(rep("Y",69),rep("N",32)))
ggplot(i3)+geom_bar(aes(x=x,y=y,fill=z),stat="identity")+
    stat_function(fun=dnorm,args=c(mean=u0,sd=s0),color="red")+
    theme(legend.position="none")+
    scale_fill_manual(values=c("red","black"))

 group1 <- c(0.7,-1.6,-0.2,-1.2,-0.1,3.4,3.7,0.8,0.0,2.0)    # 睡眠時間の増加
 t.test(group1, mu=0)

## 
##  One Sample t-test
## 
## data:  group1
## t = 1.3257, df = 9, p-value = 0.2176
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.5297804  2.0297804
## sample estimates:
## mean of x 
##      0.75

## ③－１　１標本の母平均の検定

x <- c(97.7,94.7,91.6,98.0,102.9,94.5,99.1,95.3,93.8,102.0)
z1 <- sum(x-100)/(sqrt(10)*5)
t1 <- (mean(x)-100)/(sqrt(var(x))/sqrt(10))
y1 <- dt(t1,9)
p <- 2*pt(t1,9)
ggplot(data=data.frame(x=c(-4,4)),aes(x=x))+
stat_function(fun=dt,args=c(df=9))+
geom_segment(x=t1,y=0,xend=t1,yend=y1,colour="red")+
geom_segment(x=-t1,y=0,xend=-t1,yend=y1,colour="red")

t.test(x, mu=100)

## 
##  One Sample t-test
## 
## data:  x
## t = -2.638, df = 9, p-value = 0.027
## alternative hypothesis: true mean is not equal to 100
## 95 percent confidence interval:
##  94.35307 99.56693
## sample estimates:
## mean of x 
##     96.96

## ③－２　２標本の母平均の差の検定

subX <- c(66,73,65,64,59,64,66,63,64,74)
subY <- c(61,64,69,65,70,76,74,82,79,72)
var.test(subX,subY)

## 
##  F test to compare two variances
## 
## data:  subX and subY
## F = 0.44824, num df = 9, denom df = 9, p-value = 0.2477
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.111337 1.804620
## sample estimates:
## ratio of variances 
##          0.4482422

t.test(subX,subY,paired=FALSE,var.equal=FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  subX and subY
## t = -2.1034, df = 15.718, p-value = 0.05191
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -10.85039961   0.05039961
## sample estimates:
## mean of x mean of y 
##      65.8      71.2

t.test(subX,subY,paired=FALSE,var.equal=TRUE)

## 
##  Two Sample t-test
## 
## data:  subX and subY
## t = -2.1034, df = 18, p-value = 0.04976
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -10.793730633  -0.006269367
## sample estimates:
## mean of x mean of y 
##      65.8      71.2

t.test(subX,subY,paired=TRUE)

## 
##  Paired t-test
## 
## data:  subX and subY
## t = -1.8701, df = 9, p-value = 0.09428
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  -11.932026   1.132026
## sample estimates:
## mean difference 
##            -5.4

第８回　統計的仮説検定②

## ①－３　Ｒによるシミュレーション(1)

x <- c(58,59,61,64,67,67,67,67,68,68,70,71,72,73)
y <- c(54,56,56,57,58,59,59,60,61,62,63,66,68,70,70)
z <- c(52,53,56,59,60,60,61,62,64,65,65,66,68,70,75,76)
na <- length(x);nb <- length(y);nc <- length(z)
h1 <- tibble(score=c(x,y,z),class1=c(rep("A",na),rep("B",nb),rep("C",nc)))
h1 <- h1 %>% mutate(class2=
ifelse(score<68,ifelse(score<61,"L","M"),"H")) %>%
mutate(class2=factor(class2,levels=c("L","M","H")))
h3 <- aov(score~class1,data=h1); summary(h3)

##             Df Sum Sq Mean Sq F value Pr(>F)  
## class1       2  207.2  103.61   3.196  0.051 .
## Residuals   42 1361.4   32.41                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

h4 <- aov(score~class2,data=h1); summary(h4)

##             Df Sum Sq Mean Sq F value Pr(>F)    
## class2       2 1305.8   652.9   104.4 <2e-16 ***
## Residuals   42  262.8     6.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

oneway.test(score~class1,data=h1,var.equal=TRUE)

## 
##  One-way analysis of means
## 
## data:  score and class1
## F = 3.1965, num df = 2, denom df = 42, p-value = 0.05103

oneway.test(score~class1,data=h1)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  score and class1
## F = 4.2777, num df = 2.000, denom df = 27.848, p-value = 0.02399

## ②－３　Ｒによるシミュレーション

p1 <- c(0.18,0.19,0.2,0.21,0.22)
p2 <- c(0.2,0.2,0.2,0.2,0.2)
n <- 100
i1 <- rmultinom(1,n,p1)
chisq.test(i1,p=p2)

## 
##  Chi-squared test for given probabilities
## 
## data:  i1
## X-squared = 0.8, df = 4, p-value = 0.9384

b01 <- read_csv("A02L01.csv")

## Rows: 200 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): study, result, gender
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

b02 <- table(b01$study,b01$result)
summary(b02)

## Number of cases in table: 200 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 4.444, df = 1, p-value = 0.03501

chisq.test(b02)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  b02
## X-squared = 3.7618, df = 1, p-value = 0.05244

chisq.test(b02,correct=FALSE)

## 
##  Pearson's Chi-squared test
## 
## data:  b02
## X-squared = 4.4444, df = 1, p-value = 0.03501

## ③－２　相関の検定

n <- 5;alpha <- 0.05
x <- c(1.275,0.592,-1.714,-1.888,1.641)
y <- c(-0.112,-0.212,0.575,0.088,0.890)
r <- cor(x,y)
t <- r*sqrt(n-2)/sqrt(1-r^2)
z <- 1/2*log((1+r)/(1-r))
zL<- z-qnorm(1-alpha/2,0,1)/sqrt(n-3)
zU <- z+qnorm(1-alpha/2,0,1)/sqrt(n-3)
rhoL <- (exp(2*zL)-1)/(exp(2*zL)+1)
rhoU <- (exp(2*zU)-1)/(exp(2*zU)+1)
cor.test(x,y)

## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 0.035026, df = 3, p-value = 0.9743
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.8777047  0.8866683
## sample estimates:
##        cor 
## 0.02021788

Ｒで学ぶ確立統計

放送大学

2021/12/5