#1-3-4
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
# library(readr)
# でもよい
a01 <- read_table2("A01L01.txt")
## Warning: `read_table2()` was deprecated in readr 2.0.0.
## ℹ Please use `read_table()` instead.
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## name1 = col_character(),
## A = col_double(),
## B = col_double()
## )
a02 <- read_csv("A01L02.csv")
## Rows: 5 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): name1
## dbl (2): C, D
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
a03 <- read_tsv("A01L03.tsv")
## Rows: 5 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): name1
## dbl (2): height, weight
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#2-1-5
# クロス集計
library(readr)
# library(tidyverse) でも良い。その場合も下は必要
# %>% を使うため
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
#
b01 <- read_csv("A02L01.csv")
## Rows: 200 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): study, result, gender
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#
# もし文字化けする場合に文字コードを指定するには
#
#b01 <- read_csv("A02L01.csv",locale=locale(encoding="UTF-8"))
#b01 <- read_csv("A02L01.csv",locale=locale(encoding="CP932"))
#
b02 <- table( b01$study, b01$result)
prop.table( b02 )
##
## 合格 不合格
## 勉強した 0.500 0.250
## 勉強しなかった 0.125 0.125
prop.table( b02, 1)
##
## 合格 不合格
## 勉強した 0.6666667 0.3333333
## 勉強しなかった 0.5000000 0.5000000
prop.table( b02, 2)
##
## 合格 不合格
## 勉強した 0.8000000 0.6666667
## 勉強しなかった 0.2000000 0.3333333
addmargins(b02 )
##
## 合格 不合格 Sum
## 勉強した 100 50 150
## 勉強しなかった 25 25 50
## Sum 125 75 200
addmargins(b02, 1)
##
## 合格 不合格
## 勉強した 100 50
## 勉強しなかった 25 25
## Sum 125 75
addmargins(b02, 2)
##
## 合格 不合格 Sum
## 勉強した 100 50 150
## 勉強しなかった 25 25 50
table(b01[,1:2] ) %>% prop.table() %>% addmargins()
## result
## study 合格 不合格 Sum
## 勉強した 0.500 0.250 0.750
## 勉強しなかった 0.125 0.125 0.250
## Sum 0.625 0.375 1.000
#2-2-2 (1)
library(ggplot2)
a01 <- read_csv("A02L02.csv")
## Rows: 5 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): name1
## dbl (2): A, B
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(a01, aes(x=A, y=B) ) +
geom_point()

#ggplot(a01, aes(x=A, y=B) ) +
# geom_point()+theme_bw()
#2-2-2 (2)
library(tidyverse)
library(ggplot2)
ggplot(a01, aes(x=A, y=B)) +
geom_line() +
labs(x= "height", y="weight") +
theme(text = element_text(size=16) )

# 2-2-2 (3)
library(tidyverse)
# a01 <- read_csv("A02L02.csv")
# 上をすでに行っている前提で
ggplot(a01,aes(x=A, y=B) )+
geom_point()+geom_line()+
theme(text = element_text(size=16) )

# size=16 は文字のサイズ
# 2-2-3
c01 <- read_csv("A02L03.csv")
## Rows: 7 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): A
## dbl (2): freq, pos1
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(c01,aes(x=A, y= freq ) ) +
geom_bar(stat="identity")

#
#2-2-3 R
#ggplot(b01,aes(x=study))+geom_bar()# 2-2-3 (2)
library(tidyverse)
p1 <- ggplot(c01,aes(x="",y= freq, fill=A) ) +
geom_bar(stat="identity", position="fill") +
geom_text(aes(label=freq, y=pos1))
p1

# 2-2-3 (3)
# library(tidyverse)
#p1 <- ggplot(c01,aes(x="",y= freq, fill=A) ) +
# geom_bar(stat="identity", position="fill") +
# geom_text(aes(label=freq, y=pos1))
# これがすでに行われている前提で
p2 <- p1 +
coord_polar("y", direction=-1) +
theme_void()
p2

# 2-2-4
#library(tidyverse)
c02 <- read_csv("A02L04.csv")
## Rows: 100 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): subjA, subjB
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(c02,aes(x= subjA, y= ..density..,)) +
geom_histogram() + geom_density()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# 赤いメッセージは binの幅をを変更した方が良いというメッセージ
#ggplot(c02,aes(x= subjA, y= ..density..,)) +
# geom_histogram(bins=20) + geom_density()
#3-1-4
library(tidyverse)
d01 <- read_csv("A03L01.csv")
## Rows: 200 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): subject
## dbl (1): score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
d02 <- read_csv("A03L02.csv")
## Rows: 100 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): subjA, subjB
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(d01)+geom_boxplot(aes(x=subject,y=score))

summary(d02)
## subjA subjB
## Min. :39.0 Min. :17.00
## 1st Qu.:53.0 1st Qu.:54.00
## Median :61.5 Median :62.00
## Mean :60.3 Mean :61.68
## 3rd Qu.:67.0 3rd Qu.:69.00
## Max. :82.0 Max. :99.00
cor(d02)
## subjA subjB
## subjA 1.0000000 0.1636067
## subjB 0.1636067 1.0000000
var(d02)
## subjA subjB
## subjA 89.42424 18.18788
## subjB 18.18788 138.19960
library(tidyverse)
dbinom(3,10,0.1)
## [1] 0.05739563
# ?dgeom(0,0.2)
f1 <- tibble(x=0:100, y=dbinom(seq(0,100,1),100,0.2))
ggplot(f1) +geom_bar(aes(x=x,y=y),stat="identity")

#
f2 <- tibble(x=0:1000,y=dbinom(seq(0,1000,1),1000,0.2))
ggplot(f2)+geom_bar(aes(x=x,y=y),stat="identity")+
scale_x_continuous(limits=c(150,250))
## Warning: Removed 900 rows containing missing values (`position_stack()`).
## Warning: Removed 2 rows containing missing values (`geom_bar()`).

#4-3-3
ggplot(data=data.frame(x=c(-1,1.5)), aes(x=x)) +
stat_function(fun=dexp, args=c(rate=1) )

#ggplot(data=data.frame(x=c(-0.5,4.5)), aes(x=x)) +
# stat_function(fun=dexp, args=c(rate=2) )
#
#ggplot(data=data.frame(x=c(-1,15)), aes(x=x)) +
# stat_function(fun=dgamma, args=c(shape=4,rate=2) )
#
#ggplot( data.frame(x = rnorm(1000) ) )+
# geom_histogram(aes(x=x,y=..density..)) +
# stat_function(fun = dnorm, colour = "red")
#4-3-3
library(tidyverse)
ggplot(data=data.frame(x=c(-1,15)), aes(x=x)) +
stat_function(fun=dgamma, args=c(shape=4,rate=2) )

library(tidyverse)
n <- 20
alpha <- 0.05
calpha1 <- qchisq(alpha/2,n)
calpha2 <- qchisq(1-alpha/2,n)
y1 <- dchisq(calpha1,n)
y2 <- dchisq(calpha2,n)
ggplot(data=data.frame(x=c(-1,100)),aes(x=x) )+
stat_function(fun=dchisq,args=c(df=n) )+geom_segment(x=calpha1,y=0,xend=calpha1,yend=y1,color="red")+geom_segment(x=calpha2,y=0,xend=calpha2,yend=y2,color="red")

library(tidyverse)
n <- 30
alpha <- 0.05
talpha1 <- qt(alpha/2,n)
talpha2 <- qt(1-alpha/2,n)
y1 <- dt(talpha1,n)
y2 <- dt(talpha2,n)
ggplot(data=data.frame(x=c(-5,5)),aes(x=x) )+
stat_function(fun=dt,args=c(df=n) )+
geom_segment(x=talpha1,y=0,xend=talpha1,yend=y1,color="red")+
geom_segment(x=talpha2,y=0,xend=talpha2,yend=y2,color="red")

library(tidyverse)
m <- 15
n <- 30
falpha1 <- qf(0.025,m,n) #1/qf(0.975,n,m) (nとmの入れ替え) 動画は0.95 ですが...
falpha2 <- qf(0.975,m,n)
y1 <- df(falpha1,m,n)
y2 <- df(falpha2,m,n)
ggplot(data=data.frame(x=c(-1,5)),aes(x=x) )+
stat_function(fun=df,args=c(df1=m,df2=n) )+
geom_segment(x=falpha1,y=0,xend=falpha1,yend=y1,color="red")+
geom_segment(x=falpha2,y=0,xend=falpha2,yend=y2,color="red")

# 5-3-2
library(tidyverse)
n <- 50000
p <- 0.3
bt <- tibble(count=1:n,trial=rbinom(n,1,p),success=cumsum(trial),prob=success/count)
ggplot(bt,aes(x=count,y=prob))+
ylim(0,1)+geom_hline(yintercept=p, colour="red")+geom_line()

# ylim はy軸の範囲
ggplot(bt,aes(x=count,y=prob))+
ylim(0,0.5)+geom_hline(yintercept=p, colour="red")+geom_line()

# 5-3-3
library(tidyverse)
#各回の乱数の個数
n <- 1000
# ヒストグラムを書くための繰り返しの数
m <- 10000
# 一様乱数のためのパラメータ
a <- -10
b <- 10
#試しに一回
g2 <- tibble(x=runif(n,a,b))
ggplot(g2)+geom_histogram(aes(x=x,y=..density..),bins=50)

#
g3 <- numeric(m)
for(i in 1:m){
g3[i] <- ( mean(runif(n,a,b)) -(a+b)/2 ) / ( sqrt((b-a)^2/12) /sqrt(n) )
}
g4 <- tibble(x=g3)
ggplot(g4)+geom_histogram(aes(x=x,y=..density..),bins=50)

library(tidyverse)
#
n <- 3
m <- 1000
# 5-3-4
g4 <- numeric(m)
for(i in 1:m){
g4[i] <- ( sum(rnorm(n,0,1)^2) )
}
g5 <- tibble(x=g4)
ggplot(g5)+geom_histogram(aes(x=x,y=..density..),bins=50)+stat_function(fun=dchisq, args=c(df=n) )

#ggplot(data=data.frame(x=c(-1,10)), aes(x=x)) +
# stat_function(fun=dchisq, args=c(df=3))
library(tidyverse)
library(tidyverse)
# 平均a 標準偏差が b の乱数を n個作る
n <- 20
a <- 10
b <- 5
m <- 1000
# 配列の変数を定義するためのもの
g6 <- numeric(n)
g7 <- numeric(m)
#
for(i in 1:m){
g6 <- rnorm(n,a,b)
barx <- mean(g6)
g7[i] <- sum( (g6-a)/(b*sqrt(n)) )/ sqrt( sum((g6-barx)^2/b^2 ) / (n-1) )
}
g8 <- tibble(x=g7)
ggplot(g8)+geom_histogram(aes(x=x,y=..density..),bins=50)+stat_function(fun=dt, args=c(df=(n-1)) )

library(tidyverse)
m <- 1000
n1 <- 16; a1 <- 10; b1 <- 10
n2 <- 30; a2 <- 30; b2 <- 40
ge <- numeric(m)
for(i in 1:m){
ga <-rnorm(n1,a1,b1)
gb <-rnorm(n2,a2,b2)
gc <- sum( (ga-mean(ga))^2 / (b1^2))
gd <- sum( (gb-mean(gb))^2 / (b2^2) )
# n1+ n2個から2つできる
# そのgcとgd をgeに入れていく
ge[i] <- (gc/ (n1-1) ) / (gd / (n2-1) )
}
# ge としてはm個できる。これをヒストグラムにしたものとF分布の形を比較
gf <- tibble(x=ge)
ggplot(gf)+geom_histogram(aes(x=x,y=..density..),bins=50)+stat_function(fun=df, args=c(df1=(n1-1),df2=(n2-1)))

# 5-3-1
ggplot(data=data.frame(x=c(-1,5)), aes(x=x)) +
stat_function(fun=df, args=c(df1=30,df2=20) )

#m <-15
#n <- 30
#alpha1 <- qf(0.025,m,n) # これは 1/qf(0.975,n,m) (nとmを入れ替え)
#alpha2 <- qf(0.975,m,n)
#y1 <- df(alpha1,m,n)
#y2 <- df(alpha2,m,n)
#ggplot(data=data.frame(x=c(-1,5)), aes(x=x)) +
# stat_function(fun=df, args=c(df1=m,df2=n) )+geom_segment(x=alpha1,y=0,xend=alpha1,yend=y1, colour="red") + geom_segment(x=alpha2,y=0,xend=alpha2,yend=y2,colour="red") + geom_segment(x=alpha1,y=0,xend=alpha1,yend=y1,colour="red") + geom_segment(x=alpha2,y=0,xend=alpha2,yend=y2,colour="red")
# 5-3-2
n <- 50000
p <- 0.3
bt <- tibble(count=1:n,trial=rbinom(n,1,p),success=cumsum(trial),prob=success/count)
ggplot(bt,aes(x=count,y=prob))+ylim(0,1)+geom_hline(yintercept=p, colour="red")+geom_line()

# 5-3-3
#各回の乱数の個数
n <- 1000
# ヒストグラムを書くための繰り返しの数
m <- 10000
# 一様乱数のためのパラメータ
a <- -10
b <- 10
#試しに一回
g2 <- tibble(x=runif(n,a,b))
ggplot(g2)+geom_histogram(aes(x=x,y=..density..),bins=50)

#
g3 <- numeric(m)
for(i in 1:m){
g3[i] <- ( mean(runif(n,a,b)) -(a+b)/2 ) / ( sqrt((b-a)^2/12) /sqrt(n) )
}
g4 <- tibble(x=g3)
ggplot(g4)+geom_histogram(aes(x=x,y=..density..),bins=50)

#
n <- 3
m <- 1000
# 5-3-4
g4 <- numeric(m)
for(i in 1:m){
g4[i] <- ( sum(rnorm(n,0,1)^2) )
}
g5 <- tibble(x=g4)
ggplot(g5)+geom_histogram(aes(x=x,y=..density..),bins=50)+stat_function(fun=dchisq, args=c(df=n) )

#ggplot(data=data.frame(x=c(-1,10)), aes(x=x)) +
# stat_function(fun=dchisq, args=c(df=3))
#5-3-5
#library(tidyverse)
a <- 10;b <- 5
n <- 20;m <- 1000
#
g6 <- numeric(n)
g7 <- numeric(m)
for(i in 1:m){
g6 <- rnorm(n,a,b)
barx <- mean(g6)
g7[i] <- sum( (g6-a)/(b*sqrt(n)) )/ sqrt( sum((g6-barx)^2/b^2 ) / (n-1) )
}
g8 <- tibble(x=g7)
ggplot(g8)+geom_histogram(aes(x=x,y=..density..),bins=50)+stat_function(fun=dt, args=c(df=(n-1)) )

# 5-3-6
#library(tidyverse)
m <- 1000
n1 <- 60; n2 <- 70
a1 <- 10; b1 <- 10; a2 <- 30;b2 <- 40
ge <- numeric(m)
for(i in 1:m){
# 平均a1,標準偏差b1 の正規分布に従う乱数をn1個
# 平均a2,標準偏差b2 の正規分布に従う乱数をn2個
ga <-rnorm(n1,a1,b1)
gb <-rnorm(n2,a2,b2)
# そこからgc とgd を作る
gc <- sum( (ga-mean(ga))^2 / (b1^2))
gd <- sum( (gb-mean(gb))^2 / (b2^2) )
# n1+ n2個から2つできる
# そのgcとgd をgeに入れていく
ge[i] <- (gc/ (n1-1) ) / (gd / (n2-1) )
}
# ge としてはm個できる。これをヒストグラムにしたものとF分布の形を比較
gf <- tibble(x=ge)
ggplot(gf)+geom_histogram(aes(x=x,y=..density..),bins=50)+stat_function(fun=df, args=c(df1=(n1-1),df2=(n2-1)))

# 5-3-6
#library(tidyverse)
m <- 1000
n1 <- 6; a1 <- 10;b1 <- 10
n2 <- 7; a2 <- 30;b2 <- 40
ge <- numeric(m)
for(i in 1:m){
ga <-rnorm(n1,a1,b1)
gb <-rnorm(n2,a2,b2)
gc <- sum( (ga-mean(ga))^2 / (b1^2))
gd <- sum( (gb-mean(gb))^2 / (b2^2) )
ge[i] <- (gc/ (n1-1) ) / (gd / (n2-1) )
}
gf <- tibble(x=ge)
ggplot(gf)+geom_histogram(aes(x=x,y=..density..),bins=50)+stat_function(fun=df, args=c(df1=(n1-1),df2=(n2-1)))

# 6-2-7
# Macで文字が□となる場合以下をコメントアウト
#theme_set(theme_minimal(base_size=16,base_family="sans") )
#
alpha <- 0.05
p1 <- 1-alpha/2
x1 <- qnorm(alpha/2,0,1)
y1 <- dnorm(x1,0,1)
x2 <- qnorm(1-alpha/2,0,1)
y2 <- dnorm(x2,0,1)
gg <- ggplot(data=data.frame(x=c(-4,4)), aes(x=x)) +
stat_function(fun=dnorm, args=c(mean=0,sd=1) )+geom_segment(x=x1,y=0,xend=x1,yend=y1,colour="red")+geom_segment(x=x2,y=0,xend=x2,yend=y2,colour="red")+annotate("text",x=0,y=0.3,label=expression(paste(over(alpha,2))))
gg
## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'

#plot(data=data.frame(x=c(-4,4)), aes(x=x)) +
#stat_function(fun=dnorm, args=c(mean=0,sd=1) )+geom_segment(x=x1,y=0,xend=x1,yend=y1,colour="red")+geom_segment(x=x2,y=0,xend=x2,yend=y2,colour="red")+annotate("text",x=0,y=0.3,label=expression(paste(over(alpha,2))))+annotate("text",x=2,y=0.3,label=expression(paste(over(alpha,2))))
# 6-3-1
library(tidyverse)
prob <- 0.3
#1回の標本のサンプル数(割り切れないほうが良い)
n <- 1001
#実験の回数
m <- 100
#信頼係数
alpha<- 0.05
#estimate(推定量)
est <- numeric(m)
for(j in 1:m) {
est[j] <- sum(rbinom(n,1,prob)/n)
}
#結果をtibbleで表の形にする
z <- tibble(count=1:m,est=est,
lower=est-qnorm(1-alpha/2,0,1)*sqrt(est*(1-est)/n),
upper=est+qnorm(1-alpha/2,0,1)*sqrt(est*(1-est)/n),
check= ifelse( abs(prob-est) < qnorm(1-alpha/2,0,1)*
sqrt(est*(1-est)/n),"Y","N") )
#グラフ
ggplot(z)+geom_point(aes(x=count,y=est))+
geom_segment(aes(x=count,y=lower,xend=count,yend=upper,colour=check))+
geom_hline(yintercept=prob,colour="red")+ylim(0.1,0.5)

#t分布のグラフ 7-2-3
ggplot(data=data.frame(x=c(-5,5)), aes(x=x)) +
stat_function(fun=dt, args=c(df=2),linetype="solid")+
stat_function(fun=dt, args=c(df=4),linetype="dashed")+
stat_function(fun=dt, args=c(df=10),linetype="longdash")+
stat_function(fun=dnorm,colour="red")

#7-1-4
# スライドのグラフ
library(tidyverse)
prob <- 0.5
n <- 500
u0 <- n*prob
s0 <- sqrt(n*prob*(1-prob))
qnorm(0.025,mean=u0,sd=s0)
## [1] 228.0869
i1 <- tibble(x=200:300,y=dbinom(200:300,n,prob),z=c(rep("N",29),rep("Y",43),rep("N",29)))
ggplot(i1)+geom_bar(aes(x=x,y=y,fill=z),stat="identity")+stat_function(fun=dnorm,args=c(mean=u0,sd=s0),color="red") + theme(legend.position = 'none')+scale_fill_manual(values = c("red","black"))

#7-1-4
library(tidyverse)
prob <- 0.5
n <- 500
u0 <- n*prob
s0 <- sqrt(n*prob*(1-prob))
qnorm(0.95,mean=u0,sd=s0)
## [1] 268.39
i3 <- tibble(x=200:300,y=dbinom(200:300,n,prob),z=c(rep("Y",69),rep("N",32)))
ggplot(i3)+geom_bar(aes(x=x,y=y,fill=z),stat="identity")+stat_function(fun=dnorm,args=c(mean=u0,sd=s0),color="red") + theme(legend.position = 'none')+scale_fill_manual(values = c("red","black"))

# 7-2-1
# スライドの演習
x <- c(97.7,94.7,91.6,98.0,102.9,94.5,99.1,95.3,93.8,102.0)
z1 <- sum(x-100)/(sqrt(10)*5)
t1 <- (mean(x)-100)/(sqrt(var(x))/sqrt(10))
y1 <- dt(t1,9)
p<- 2*pt(t1,9)
# 7-2-2
#library(tidyverse)
#ggplot(data=data.frame(x=c(-5,5)), aes(x=x)) +
# stat_function(fun=dt, args=c(df=2),linetype="solid")+
# stat_function(fun=dt, args=c(df=4),linetype="dashed")+
# stat_function(fun=dt, args=c(df=10),linetype="longdash")+
# stat_function(fun=dnorm,colour="red")
# 7-3-1
library(tidyverse)
x <- c(97.7,94.7,91.6,98.0,102.9,94.5,99.1,95.3,93.8,102.0)
z1 <- sum(x-100)/(sqrt(10)*5)
t1 <- (mean(x)-100)/(sqrt(var(x))/sqrt(10))
y1 <- dt(t1,9)
ggplot(data=data.frame(x=c(-4,4)), aes(x=x)) +
stat_function(fun=dt,args=c(df=9))+
geom_segment(x=t1,y=0,xend=t1,yend=y1,colour="red")+
geom_segment(x=-t1,y=0,xend=-t1,yend=y1,colour="red")

p<- 2*pt(t1,9)
t.test(x,mu=100,alternative="two.sided")
##
## One Sample t-test
##
## data: x
## t = -2.638, df = 9, p-value = 0.027
## alternative hypothesis: true mean is not equal to 100
## 95 percent confidence interval:
## 94.35307 99.56693
## sample estimates:
## mean of x
## 96.96
#t.test(x,mu=100,alternative="greater")
#t.test(x,mu=100,alternative="two.sided")
# 7-3-2
library(tidyverse)
subX <- c(66,73,65,64,59,64,66,63,64,74)
subY <- c(61,64,69,65,70,76,74,82,79,72)
ex1 <- tibble(subX=subX,subY=subY)
ex2 <- ex1 %>% pivot_longer(subX:subY,names_to="sub",values_to="score")
ggplot(ex2)+geom_boxplot(aes(x=sub,y=score))

var.test(ex1$subX,ex1$subY)
##
## F test to compare two variances
##
## data: ex1$subX and ex1$subY
## F = 0.44824, num df = 9, denom df = 9, p-value = 0.2477
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.111337 1.804620
## sample estimates:
## ratio of variances
## 0.4482422
t.test(ex1$subX,ex1$subY,paired=FALSE,var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: ex1$subX and ex1$subY
## t = -2.1034, df = 15.718, p-value = 0.05191
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -10.85039961 0.05039961
## sample estimates:
## mean of x mean of y
## 65.8 71.2
t.test(ex1$subX,ex1$subY,paired=FALSE,var.equal = TRUE)
##
## Two Sample t-test
##
## data: ex1$subX and ex1$subY
## t = -2.1034, df = 18, p-value = 0.04976
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -10.793730633 -0.006269367
## sample estimates:
## mean of x mean of y
## 65.8 71.2
# 8-1-3
library( mvtnorm )
## Warning: package 'mvtnorm' was built under R version 4.2.3
n1 <- 14;n2 <- 15;n3 <- 16
#set.seet(3)
#x <- round(rnorm(n1,65,6),0)
#y <- round(rnorm(n2,68,6.5),0)
#z <- round(rnorm(n3,62,5.5),0)
x <- c(58,59,61,64,67,67,67,67,68,68,70,71,72,73)
y <- c(54,56,56,57,58,59,59,60,61,62,63,66,68,70,70)
z <- c(52,53,56,59,60,60,61,62,64,65,65,66,68,70,75,76)
#x <-c(51,56,58 62 62 64 65 66 70 72)
#y <-c(56, 62 64 64 68 72 73 74 74 75)
# 点数と ABCでクラス分け
h1 <- tibble(score=c(x,y,z),class1=c(rep("A",n1),rep("B",n2),rep("C",n3)))
#中身を確認する場合には以下のコマンドを打つ
#summary(h1)
#h1
# 点数と LMHでクラス分け
# group_by はclass1ごとに分けて平均点を計算
h1 <- h1 %>% mutate(class2 = ifelse(score<68,ifelse(score<61,"L","M"),"H") ) %>%
mutate(class2=factor(class2,levels=c("L","M","H"))) %>%
group_by(class1) %>% mutate(mean=mean(score))
ggplot(h1)+geom_point(aes(x=class1,y=score))+geom_line(aes(x=class1,y=score))+
geom_point(aes(x=class1,y=mean),colour="red")+
geom_hline(yintercept=mean(h1$score),colour="blue")

# 検定
h3 <- aov(score~class1,data=h1)
summary(h3)
## Df Sum Sq Mean Sq F value Pr(>F)
## class1 2 207.2 103.61 3.196 0.051 .
## Residuals 42 1361.4 32.41
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# クラス2 で平均点を計算し直す
h1 <- h1 %>% group_by(class2) %>% mutate(mean=mean(score))
ggplot(h1)+geom_point(aes(x=class2,y=score))+geom_line(aes(x=class2,y=score))+
geom_point(aes(x=class2,y=mean),colour="red")+
geom_hline(yintercept=mean(h1$score),colour="blue")

# 検定
h4 <- aov(score~class2,data=h1)
summary(h4)
## Df Sum Sq Mean Sq F value Pr(>F)
## class2 2 1305.8 652.9 104.4 <2e-16 ***
## Residuals 42 262.8 6.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# h3 と比較
oneway.test(score~class1,data=h1,var.equal=TRUE)
##
## One-way analysis of means
##
## data: score and class1
## F = 3.1965, num df = 2, denom df = 42, p-value = 0.05103
oneway.test(score~class1,data=h1)
##
## One-way analysis of means (not assuming equal variances)
##
## data: score and class1
## F = 4.2777, num df = 2.000, denom df = 27.848, p-value = 0.02399
# 追加(h4と比較する場合)
#oneway.test(score~class2,data=h1,var.equal=TRUE)
#oneway.test(score~class2,data=h1)
# 8-2-2 のスライドのグラフ
library(tidyverse)
n <- 30
x1 <- qchisq(0.95,n)
y1 <- dchisq(x1,n)
ggplot(data=data.frame(x=c(-1,60)), aes(x=x)) +
stat_function(fun=dchisq, args=c(df=n))+
geom_segment(x=x1,y=0,xend=x1,yend=y1)

# 8-2-3 Part1
p1 <- c(0.16,0.18,0.2,0.22,0.24)
p2 <- c(0.2,0.2,0.2,0.2,0.2)
n <- 100
i1 <- rmultinom(1,n,p1)
chisq.test(i1,p=p2)
##
## Chi-squared test for given probabilities
##
## data: i1
## X-squared = 4.7, df = 4, p-value = 0.3195
# 8-2-3 Part2
library(tidyverse)
b01 <- read_csv("A02L01.csv")
## Rows: 200 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): study, result, gender
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
b02 <- table( b01$study, b01$result)
summary(b02)
## Number of cases in table: 200
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 4.444, df = 1, p-value = 0.03501
chisq.test(b02)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: b02
## X-squared = 3.7618, df = 1, p-value = 0.05244
chisq.test(b02,correct=FALSE)
##
## Pearson's Chi-squared test
##
## data: b02
## X-squared = 4.4444, df = 1, p-value = 0.03501
# イェーツの補正をするかしないかを明示する
#chisq.test(b02,correct=TRUE)
# 8-3-2
library(tidyverse)
n <- 5;alpha <- 0.05
# 5個のサンプルデータ
x <- c(1.275,0.592,1.714,-1.888,1.641)
y <- c(-0.112,-0.212,0.575,0.088,0.890)
# corで相関を計算
r <- cor(x,y)
t <- r * sqrt( n-2 )/sqrt( 1- r^2 )
z <- 1/2 * log( (1 + r )/ ( 1 - r ))
zL <- z - qnorm(1- alpha/2,0,1) /sqrt(n-3)
zU <- z + qnorm(1- alpha/2,0,1) /sqrt(n-3)
rhoL <- ( exp(2*zL)-1 )/( exp(2*zL)+1 )
rhoU <- ( exp(2*zU)-1 )/( exp(2*zU)+1 )
cor.test(x,y)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 0.80951, df = 3, p-value = 0.4775
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7324847 0.9505777
## sample estimates:
## cor
## 0.4234072
# cor.test の値と下の値を比較する
#c(rhoL,rhoU)
#t
# 8-3-2
# 追加 多項分布で乱数を発生させたもの
library(tidyverse)
rho <- 0.3
n <-10; alpha <- 0.05
j0 <- matrix(c(1,rho,rho,1),nrow=2)
j1<- rmvnorm(n, mean = c(0, 0), sigma=j0)
r <- cor(j1[,1],j1[,2])
t <- r*sqrt(n-2)/sqrt(1-r^2)
z <- 1/2 * log((1+r)/(1-r))
zL <- z - qnorm(1-alpha/2,0,1) /sqrt(n-3)
zU <- z + qnorm(1- alpha/2,0,1) /sqrt(n-3)
rhoL <- ( exp(2*zL)-1 )/( exp(2*zL)+1 )
rhoU <- ( exp(2*zU)-1 )/( exp(2*zU)+1 )#E2-1-3
library(tidyverse)
x21 <- read_csv("E02P01.csv")
## Rows: 200 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): study, result, gender
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
x22 <- table(x21$study,x21$result)
x23 <- prop.table(x22)
x24 <- addmargins(x22)
x24
##
## 合格 不合格 Sum
## 勉強した 50 100 150
## 勉強しなかった 25 25 50
## Sum 75 125 200
#E2-1-4
library(tidyverse)
x21 <- read_csv("E02P01.csv")
## Rows: 200 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): study, result, gender
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
x22 <- table(x21$study,x21$result)
x23 <- prop.table(x22)
x24 <- addmargins(x23)
x24
##
## 合格 不合格 Sum
## 勉強した 0.250 0.500 0.750
## 勉強しなかった 0.125 0.125 0.250
## Sum 0.375 0.625 1.000
#2.2.1
library(tidyverse)
x29 <- read_csv("E02P02.csv",col_types="fddd")
ggplot(x29,aes(x="",y=人数,fill=世代))+geom_bar(stat="identity")+
geom_text(aes(y=座標,label=割合))+coord_polar("y",direction=-1)+theme_void()

#2.2.2
library(tidyverse)
q02 <- read_csv("E02P03.csv",col_types="fdd")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
names(q02)
## [1] "Media" "Day" "Time"
ggplot(q02,aes(x=q02$Media,y=q02$Time,fill=q02$Day))+geom_bar(stat="identity")+geom_text(aes(label=q02$Time),vjust=5,position="stack")

#は2.2.3
library(tidyverse)
q03 <- read_csv("E02P04.csv",col_types="dd")
ggplot(q03,aes(x=q03$Year,y=count))+geom_point()+geom_line()+ggtitle("高齢者の交通事故死亡者数の推移")

#2.2.4
library(tidyverse)
q04 <- read_csv("E02P05.csv")
## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Age, Height, position
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q05 <- read_csv("E02P05.csv",col_types="fdd")
ggplot(q04)+geom_point(aes(x=q04$Age,y=q04$Height))+geom_line(aes(x=q04$Age,y=q04$Height))

ggplot(q05)+geom_bar(aes(x=q05$Age,y=q05$Height),stat="identity")

ggplot(q05,aes(x="",y=q05$Height,fill=q05$Age))+geom_bar(,stat="identity",position="fill")+geom_text(aes(label=q05$Height,y=position),vjust=1)

ggplot(q05,aes(x="",y=q05$Height,fill=q05$Age))+geom_bar(,stat="identity",position="fill")+geom_text(aes(label=q05$Height,y=position),vjust=1)+coord_polar("y",direction=-1)+theme_void()

#2.2.5
library(tidyverse)
q06 <- read_csv("E02P06.csv")
## Rows: 100 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): subjA, subjB
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(q06)+geom_bar(aes(x=subjA,y=subjB),stat="identity")

ggplot(q06,aes(x=subjA,y=..density..,))+geom_histogram(bins=20)+geom_density()

ggplot(q06)+geom_point(aes(x=subjA,y=subjB))

ggplot(q06)+geom_line(aes(x=subjA,y=subjB))

#3.1.2
library(tidyverse)
q07 <- read_csv("E03P01.csv")
## Rows: 17 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q08 <- read_csv("E03P02.csv")
## Rows: 8 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): 階級, 階級値
## dbl (1): 度数
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q08
## # A tibble: 8 × 3
## 階級 階級値 度数
## <chr> <chr> <dbl>
## 1 10点以上20点未満 15点 1
## 2 20点以上30点未満 25点 1
## 3 30点以上40点未満 35点 2
## 4 40点以上50点未満 45点 2
## 5 50点以上60点未満 55点 5
## 6 60点以上70点未満 65点 2
## 7 70点以上80点未満 75点 2
## 8 80点以上90点未満 85点 1
#3.1.3
library(tidyverse)
q09 <- read_csv("E03P03.csv")
## Rows: 64 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): sub
## dbl (2): id, score
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
q10 <- read_csv("E03P04.csv")
## Rows: 6 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): 点数
## dbl (1): 度数
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(q10,aes(x=点数,y=度数))+geom_bar(stat="identity",width=1)

ggplot(q09)+stat_boxplot(aes(x=sub,y=score),coef=Inf)

#4.3.3
library(tidyverse)
rate <- 2
x1 <- 1
y1 <- dexp(x1,rate)
ggplot(data=data.frame(x=c(0,4)), aes(x=x)) +
stat_function(fun=dexp, args=c(rate=rate) )+geom_area(stat = "function", fun = dexp, args=c(rate=rate),fill = "red", xlim = c(1, 6))+geom_segment(x=0,y=0,xend=0,yend=2.5,colour="black")+geom_segment(x=0,y=0,xend=8,yend=0,colour="black")

#4.3.4
library(tidyverse)
rate <- 2
shape <- 3
x1 <- 1
y1 <- dgamma(x1,shape=shape,rate=rate)
ggplot(data=data.frame(x=c(0,5)), aes(x=x)) +
stat_function(fun=dgamma, args=c(shape=shape,rate=rate) )+
geom_area(stat = "function", fun = dgamma, args=c(shape=shape,rate=rate),fill = "red", xlim = c(0, x1))+geom_segment(x=0,y=0,xend=0,yend=dgamma(0,shape,rate),colour="black")+geom_segment(x=0,y=0,xend=8,yend=0,colour="black")

#4.3.5
library(tidyverse)
mean <- 50
sd <- 10
x1 <- 40
y1 <- dnorm(x1,mean=mean,sd=sd)
ggplot(data=data.frame(x=c(0,100)), aes(x=x)) +
stat_function(fun=dnorm, args=c(mean=mean,sd=sd) ) +
geom_area(stat = "function", fun = dnorm, args=c(mean=mean,sd=sd),fill = "red", xlim = c(0, x1))

#5.2.1
library(tidyverse)
ggplot(data=data.frame(x=c(0,5)), aes(x=x)) +
stat_function(fun=dexp, args=c(rate=1) )

ggplot(data=data.frame(x=c(-4,4)), aes(x=x)) +
stat_function(fun=dt, args=c(df=10) )

ggplot(data=data.frame(x=c(0,6)), aes(x=x)) +
stat_function(fun=df, args=c(df1=3,df2=10) )

ggplot(data=data.frame(x=c(-0.5,1.5)), aes(x=x)) +
stat_function(fun=dunif, args=c(min=0,max=1) )

#6.3.1
library(tidyverse)
prob <- 0.3
n <- 1001
m <- 100
alpha<- 0.05
est <- numeric(m)
for(j in 1:m) {
est[j] <- sum(rbinom(n,1,prob)/n)
}
z <- tibble(count=1:m,est=est,
lower=est-qnorm(1-alpha/2,0,1)*sqrt(est*(1-est)/n),
upper=est+qnorm(1-alpha/2,0,1)*sqrt(est*(1-est)/n),
check= ifelse( abs(prob-est) < qnorm(1-alpha/2,0,1)*
sqrt(est*(1-est)/n),"Y","N") )
ggplot(z)+geom_point(aes(x=count,y=est))+
geom_segment(aes(x=count,y=lower,xend=count,yend=upper,colour=check))+
geom_hline(yintercept=prob,colour="red")
