RProgram.knit

#1-3-4
library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.5.0 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

# library(readr) 
# でもよい
a01 <- read_table2("A01L01.txt")

## Warning: `read_table2()` was deprecated in readr 2.0.0.
## ℹ Please use `read_table()` instead.

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   name1 = col_character(),
##   A = col_double(),
##   B = col_double()
## )

a02 <- read_csv("A01L02.csv")

## Rows: 5 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): name1
## dbl (2): C, D
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

a03 <- read_tsv("A01L03.tsv")

## Rows: 5 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): name1
## dbl (2): height, weight
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#2-1-5
# クロス集計
library(readr)
# library(tidyverse) でも良い。その場合も下は必要
# %>% を使うため
library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

#
b01 <- read_csv("A02L01.csv")

## Rows: 200 Columns: 3

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): study, result, gender
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

#
# もし文字化けする場合に文字コードを指定するには
#
#b01 <-  read_csv("A02L01.csv",locale=locale(encoding="UTF-8"))
#b01 <-  read_csv("A02L01.csv",locale=locale(encoding="CP932"))
#
b02 <- table( b01$study, b01$result) 
prop.table( b02 )

##                 
##                   合格 不合格
##   勉強した       0.500  0.250
##   勉強しなかった 0.125  0.125

prop.table( b02, 1)

##                 
##                       合格    不合格
##   勉強した       0.6666667 0.3333333
##   勉強しなかった 0.5000000 0.5000000

prop.table( b02, 2)

##                 
##                       合格    不合格
##   勉強した       0.8000000 0.6666667
##   勉強しなかった 0.2000000 0.3333333

addmargins(b02 )

##                 
##                  合格 不合格 Sum
##   勉強した        100     50 150
##   勉強しなかった   25     25  50
##   Sum             125     75 200

addmargins(b02, 1)

##                 
##                  合格 不合格
##   勉強した        100     50
##   勉強しなかった   25     25
##   Sum             125     75

addmargins(b02, 2)

##                 
##                  合格 不合格 Sum
##   勉強した        100     50 150
##   勉強しなかった   25     25  50

table(b01[,1:2] ) %>%  prop.table() %>% addmargins()

##                 result
## study             合格 不合格   Sum
##   勉強した       0.500  0.250 0.750
##   勉強しなかった 0.125  0.125 0.250
##   Sum            0.625  0.375 1.000

#2-2-2 (1)
library(ggplot2)
a01 <- read_csv("A02L02.csv")

## Rows: 5 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): name1
## dbl (2): A, B
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

ggplot(a01, aes(x=A, y=B) ) + 
    geom_point()

#ggplot(a01, aes(x=A, y=B) ) + 
#  geom_point()+theme_bw()

#2-2-2 (2)
library(tidyverse)
library(ggplot2)
ggplot(a01, aes(x=A, y=B)) +
  geom_line() +
  labs(x= "height", y="weight") +
  theme(text = element_text(size=16) )

# 2-2-2 (3)
library(tidyverse)
# a01 <- read_csv("A02L02.csv")
# 上をすでに行っている前提で
ggplot(a01,aes(x=A, y=B) )+ 
  geom_point()+geom_line()+
  theme(text = element_text(size=16) )

# size=16 は文字のサイズ

# 2-2-3
c01 <- read_csv("A02L03.csv")

## Rows: 7 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): A
## dbl (2): freq, pos1
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

ggplot(c01,aes(x=A, y= freq ) ) + 
  geom_bar(stat="identity")

#2-2-3 R
#ggplot(b01,aes(x=study))+geom_bar()# 2-2-3 (2)
library(tidyverse)
p1 <- ggplot(c01,aes(x="",y= freq, fill=A) ) + 
  geom_bar(stat="identity", position="fill") + 
  geom_text(aes(label=freq, y=pos1))
p1

# 2-2-3 (3)
# library(tidyverse)
#p1 <- ggplot(c01,aes(x="",y= freq, fill=A) ) + 
#  geom_bar(stat="identity", position="fill") + 
#  geom_text(aes(label=freq, y=pos1))
# これがすでに行われている前提で
p2 <- p1 + 
  coord_polar("y", direction=-1) +
  theme_void()
p2

# 2-2-4
#library(tidyverse)
c02 <- read_csv("A02L04.csv")

## Rows: 100 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): subjA, subjB
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

ggplot(c02,aes(x= subjA, y= ..density..,)) + 
  geom_histogram() +  geom_density()

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# 赤いメッセージは binの幅をを変更した方が良いというメッセージ
#ggplot(c02,aes(x= subjA, y= ..density..,)) + 
#  geom_histogram(bins=20) +  geom_density()

#3-1-4
library(tidyverse)
d01 <- read_csv("A03L01.csv")

## Rows: 200 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): subject
## dbl (1): score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

d02 <- read_csv("A03L02.csv")

## Rows: 100 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): subjA, subjB
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

ggplot(d01)+geom_boxplot(aes(x=subject,y=score))

summary(d02)

##      subjA          subjB      
##  Min.   :39.0   Min.   :17.00  
##  1st Qu.:53.0   1st Qu.:54.00  
##  Median :61.5   Median :62.00  
##  Mean   :60.3   Mean   :61.68  
##  3rd Qu.:67.0   3rd Qu.:69.00  
##  Max.   :82.0   Max.   :99.00

cor(d02)

##           subjA     subjB
## subjA 1.0000000 0.1636067
## subjB 0.1636067 1.0000000

var(d02)

##          subjA     subjB
## subjA 89.42424  18.18788
## subjB 18.18788 138.19960

library(tidyverse)
dbinom(3,10,0.1)

## [1] 0.05739563

# ?dgeom(0,0.2)
f1 <- tibble(x=0:100, y=dbinom(seq(0,100,1),100,0.2))
ggplot(f1) +geom_bar(aes(x=x,y=y),stat="identity")

#
f2 <- tibble(x=0:1000,y=dbinom(seq(0,1000,1),1000,0.2))
ggplot(f2)+geom_bar(aes(x=x,y=y),stat="identity")+
scale_x_continuous(limits=c(150,250))

## Warning: Removed 900 rows containing missing values (`position_stack()`).

## Warning: Removed 2 rows containing missing values (`geom_bar()`).

#4-3-3
ggplot(data=data.frame(x=c(-1,1.5)), aes(x=x)) +
  stat_function(fun=dexp, args=c(rate=1) )

#ggplot(data=data.frame(x=c(-0.5,4.5)), aes(x=x)) +
#  stat_function(fun=dexp, args=c(rate=2) )
#
#ggplot(data=data.frame(x=c(-1,15)), aes(x=x)) +
#  stat_function(fun=dgamma, args=c(shape=4,rate=2) )
#
#ggplot( data.frame(x = rnorm(1000) ) )+
#  geom_histogram(aes(x=x,y=..density..)) +
#  stat_function(fun = dnorm, colour = "red")

#4-3-3
library(tidyverse)
ggplot(data=data.frame(x=c(-1,15)), aes(x=x)) +
  stat_function(fun=dgamma, args=c(shape=4,rate=2) )

library(tidyverse)
n <- 20
alpha <- 0.05
calpha1 <- qchisq(alpha/2,n)
calpha2 <- qchisq(1-alpha/2,n)
y1 <- dchisq(calpha1,n)
y2 <- dchisq(calpha2,n)
ggplot(data=data.frame(x=c(-1,100)),aes(x=x) )+
    stat_function(fun=dchisq,args=c(df=n) )+geom_segment(x=calpha1,y=0,xend=calpha1,yend=y1,color="red")+geom_segment(x=calpha2,y=0,xend=calpha2,yend=y2,color="red")

library(tidyverse)
n <- 30
alpha <- 0.05
talpha1 <- qt(alpha/2,n)
talpha2 <- qt(1-alpha/2,n)
y1 <- dt(talpha1,n)
y2 <- dt(talpha2,n)
ggplot(data=data.frame(x=c(-5,5)),aes(x=x) )+
    stat_function(fun=dt,args=c(df=n) )+
  geom_segment(x=talpha1,y=0,xend=talpha1,yend=y1,color="red")+
  geom_segment(x=talpha2,y=0,xend=talpha2,yend=y2,color="red")

library(tidyverse)
m <- 15
n <- 30
falpha1 <- qf(0.025,m,n) #1/qf(0.975,n,m) (nとmの入れ替え) 動画は0.95 ですが...
falpha2 <- qf(0.975,m,n)
y1 <- df(falpha1,m,n)
y2 <- df(falpha2,m,n)
ggplot(data=data.frame(x=c(-1,5)),aes(x=x) )+
    stat_function(fun=df,args=c(df1=m,df2=n) )+
  geom_segment(x=falpha1,y=0,xend=falpha1,yend=y1,color="red")+
  geom_segment(x=falpha2,y=0,xend=falpha2,yend=y2,color="red")

# 5-3-2
library(tidyverse)
n <- 50000
p <- 0.3
bt <- tibble(count=1:n,trial=rbinom(n,1,p),success=cumsum(trial),prob=success/count)
ggplot(bt,aes(x=count,y=prob))+
  ylim(0,1)+geom_hline(yintercept=p, colour="red")+geom_line()

# ylim はy軸の範囲
ggplot(bt,aes(x=count,y=prob))+
  ylim(0,0.5)+geom_hline(yintercept=p, colour="red")+geom_line()

# 5-3-3
library(tidyverse)
#各回の乱数の個数
n <- 1000
# ヒストグラムを書くための繰り返しの数
m <- 10000
# 一様乱数のためのパラメータ
a <- -10
b <- 10
#試しに一回
g2 <- tibble(x=runif(n,a,b))
ggplot(g2)+geom_histogram(aes(x=x,y=..density..),bins=50)

#
g3 <- numeric(m)
for(i in 1:m){
  g3[i] <- ( mean(runif(n,a,b)) -(a+b)/2 )  / ( sqrt((b-a)^2/12) /sqrt(n) )  
}
g4 <- tibble(x=g3)
ggplot(g4)+geom_histogram(aes(x=x,y=..density..),bins=50)

library(tidyverse)
#
n <- 3
m <- 1000

# 5-3-4
g4 <- numeric(m)
for(i in 1:m){
  g4[i] <- ( sum(rnorm(n,0,1)^2) ) 
}
g5 <- tibble(x=g4)
ggplot(g5)+geom_histogram(aes(x=x,y=..density..),bins=50)+stat_function(fun=dchisq, args=c(df=n) )

#ggplot(data=data.frame(x=c(-1,10)), aes(x=x)) +
#  stat_function(fun=dchisq, args=c(df=3))
library(tidyverse)

library(tidyverse)
# 平均a 標準偏差が b の乱数を n個作る
n <- 20
a <- 10
b <- 5
m <- 1000
# 配列の変数を定義するためのもの
g6 <- numeric(n)
g7 <- numeric(m)
#
for(i in 1:m){
  g6 <- rnorm(n,a,b)
  barx <- mean(g6)
  g7[i] <- sum( (g6-a)/(b*sqrt(n)) )/ sqrt( sum((g6-barx)^2/b^2 ) / (n-1) )
}
g8 <- tibble(x=g7)
ggplot(g8)+geom_histogram(aes(x=x,y=..density..),bins=50)+stat_function(fun=dt, args=c(df=(n-1)) )

library(tidyverse)
m <- 1000
n1 <- 16; a1 <- 10; b1 <- 10
n2 <- 30; a2 <- 30; b2 <- 40

ge <- numeric(m)
for(i in 1:m){
  ga <-rnorm(n1,a1,b1)
  gb <-rnorm(n2,a2,b2)
  gc <- sum( (ga-mean(ga))^2 / (b1^2))
  gd <- sum( (gb-mean(gb))^2 / (b2^2) )
  #  n1+ n2個から２つできる
  # そのgcとgd をgeに入れていく
  ge[i] <- (gc/ (n1-1) ) / (gd / (n2-1) )
}
# ge としてはm個できる。これをヒストグラムにしたものとF分布の形を比較
gf <- tibble(x=ge)
ggplot(gf)+geom_histogram(aes(x=x,y=..density..),bins=50)+stat_function(fun=df, args=c(df1=(n1-1),df2=(n2-1)))

# 5-3-1
ggplot(data=data.frame(x=c(-1,5)), aes(x=x)) +
    stat_function(fun=df, args=c(df1=30,df2=20) )

#m <-15 
#n <- 30
#alpha1 <- qf(0.025,m,n) # これは 1/qf(0.975,n,m) （nとmを入れ替え）
#alpha2 <- qf(0.975,m,n)
#y1 <- df(alpha1,m,n)
#y2 <- df(alpha2,m,n)
#ggplot(data=data.frame(x=c(-1,5)), aes(x=x)) +
#  stat_function(fun=df, args=c(df1=m,df2=n) )+geom_segment(x=alpha1,y=0,xend=alpha1,yend=y1, colour="red") + geom_segment(x=alpha2,y=0,xend=alpha2,yend=y2,colour="red") + geom_segment(x=alpha1,y=0,xend=alpha1,yend=y1,colour="red") + geom_segment(x=alpha2,y=0,xend=alpha2,yend=y2,colour="red")

# 5-3-2
n <- 50000
p <- 0.3
bt <- tibble(count=1:n,trial=rbinom(n,1,p),success=cumsum(trial),prob=success/count)
ggplot(bt,aes(x=count,y=prob))+ylim(0,1)+geom_hline(yintercept=p, colour="red")+geom_line()

# 5-3-3
#各回の乱数の個数
n <- 1000
# ヒストグラムを書くための繰り返しの数
m <- 10000
# 一様乱数のためのパラメータ
a <- -10
b <- 10
#試しに一回
g2 <- tibble(x=runif(n,a,b))
ggplot(g2)+geom_histogram(aes(x=x,y=..density..),bins=50)

#
g3 <- numeric(m)
for(i in 1:m){
  g3[i] <- ( mean(runif(n,a,b)) -(a+b)/2 )  / ( sqrt((b-a)^2/12) /sqrt(n) )  
}
g4 <- tibble(x=g3)
ggplot(g4)+geom_histogram(aes(x=x,y=..density..),bins=50)

#
n <- 3
m <- 1000

# 5-3-4
g4 <- numeric(m)
for(i in 1:m){
  g4[i] <- ( sum(rnorm(n,0,1)^2) ) 
}
g5 <- tibble(x=g4)
ggplot(g5)+geom_histogram(aes(x=x,y=..density..),bins=50)+stat_function(fun=dchisq, args=c(df=n) )

#ggplot(data=data.frame(x=c(-1,10)), aes(x=x)) +
#  stat_function(fun=dchisq, args=c(df=3))

#5-3-5
#library(tidyverse)
a <- 10;b <- 5
n <- 20;m <- 1000
#
g6 <- numeric(n)
g7 <- numeric(m)
for(i in 1:m){
  g6 <- rnorm(n,a,b)
  barx <- mean(g6)
  g7[i] <- sum( (g6-a)/(b*sqrt(n)) )/ sqrt( sum((g6-barx)^2/b^2 ) / (n-1) )
}
g8 <- tibble(x=g7)
ggplot(g8)+geom_histogram(aes(x=x,y=..density..),bins=50)+stat_function(fun=dt, args=c(df=(n-1)) )

# 5-3-6
#library(tidyverse)
m <- 1000
n1 <- 60; n2 <- 70
a1 <- 10; b1 <- 10; a2 <- 30;b2 <- 40

ge <- numeric(m)
for(i in 1:m){
  # 平均a1,標準偏差b1 の正規分布に従う乱数をn1個
  # 平均a2,標準偏差b2 の正規分布に従う乱数をn2個
  ga <-rnorm(n1,a1,b1)
  gb <-rnorm(n2,a2,b2)
  # そこからgc とgd を作る
  gc <- sum( (ga-mean(ga))^2 / (b1^2))
  gd <- sum( (gb-mean(gb))^2 / (b2^2) )
  #  n1+ n2個から２つできる
  # そのgcとgd をgeに入れていく
  ge[i] <- (gc/ (n1-1) ) / (gd / (n2-1) )
}
# ge としてはm個できる。これをヒストグラムにしたものとF分布の形を比較
gf <- tibble(x=ge)
ggplot(gf)+geom_histogram(aes(x=x,y=..density..),bins=50)+stat_function(fun=df, args=c(df1=(n1-1),df2=(n2-1)))

# 5-3-6
#library(tidyverse)
m <- 1000
n1 <- 6; a1 <- 10;b1 <- 10
n2 <- 7; a2 <- 30;b2 <- 40
ge <- numeric(m)
for(i in 1:m){
  ga <-rnorm(n1,a1,b1) 
  gb <-rnorm(n2,a2,b2) 
  gc <- sum( (ga-mean(ga))^2 / (b1^2))
  gd <- sum( (gb-mean(gb))^2 / (b2^2) )
  ge[i] <- (gc/ (n1-1) ) / (gd / (n2-1) )
}
gf <- tibble(x=ge)
ggplot(gf)+geom_histogram(aes(x=x,y=..density..),bins=50)+stat_function(fun=df, args=c(df1=(n1-1),df2=(n2-1)))

# 6-2-7 
# Macで文字が□となる場合以下をコメントアウト
#theme_set(theme_minimal(base_size=16,base_family="sans") ) 
#
alpha <- 0.05
p1 <- 1-alpha/2
x1 <- qnorm(alpha/2,0,1)
y1 <- dnorm(x1,0,1)
x2 <- qnorm(1-alpha/2,0,1)
y2 <- dnorm(x2,0,1)
gg <- ggplot(data=data.frame(x=c(-4,4)), aes(x=x)) +
  stat_function(fun=dnorm, args=c(mean=0,sd=1) )+geom_segment(x=x1,y=0,xend=x1,yend=y1,colour="red")+geom_segment(x=x2,y=0,xend=x2,yend=y2,colour="red")+annotate("text",x=0,y=0.3,label=expression(paste(over(alpha,2))))
gg

## Warning in is.na(x): is.na() applied to non-(list or vector) of type
## 'expression'

#plot(data=data.frame(x=c(-4,4)), aes(x=x)) +
  #stat_function(fun=dnorm, args=c(mean=0,sd=1) )+geom_segment(x=x1,y=0,xend=x1,yend=y1,colour="red")+geom_segment(x=x2,y=0,xend=x2,yend=y2,colour="red")+annotate("text",x=0,y=0.3,label=expression(paste(over(alpha,2))))+annotate("text",x=2,y=0.3,label=expression(paste(over(alpha,2))))

# 6-3-1
library(tidyverse)
prob <- 0.3
#１回の標本のサンプル数（割り切れないほうが良い）
n <- 1001
#実験の回数
m <- 100 
#信頼係数
alpha<- 0.05
#estimate（推定量)
est <- numeric(m)
for(j in 1:m) {
  est[j]  <- sum(rbinom(n,1,prob)/n)
} 
#結果をtibbleで表の形にする
z <- tibble(count=1:m,est=est,
            lower=est-qnorm(1-alpha/2,0,1)*sqrt(est*(1-est)/n),
            upper=est+qnorm(1-alpha/2,0,1)*sqrt(est*(1-est)/n),
            check= ifelse( abs(prob-est) < qnorm(1-alpha/2,0,1)*
                             sqrt(est*(1-est)/n),"Y","N")  )
#グラフ
ggplot(z)+geom_point(aes(x=count,y=est))+
  geom_segment(aes(x=count,y=lower,xend=count,yend=upper,colour=check))+
  geom_hline(yintercept=prob,colour="red")+ylim(0.1,0.5)

#t分布のグラフ　7-2-3 
ggplot(data=data.frame(x=c(-5,5)), aes(x=x)) +
  stat_function(fun=dt, args=c(df=2),linetype="solid")+
  stat_function(fun=dt, args=c(df=4),linetype="dashed")+
  stat_function(fun=dt, args=c(df=10),linetype="longdash")+
  stat_function(fun=dnorm,colour="red")

#7-1-4
# スライドのグラフ
library(tidyverse)
prob <- 0.5
n <- 500
u0 <- n*prob
s0 <- sqrt(n*prob*(1-prob))
qnorm(0.025,mean=u0,sd=s0)

## [1] 228.0869

i1 <- tibble(x=200:300,y=dbinom(200:300,n,prob),z=c(rep("N",29),rep("Y",43),rep("N",29)))
ggplot(i1)+geom_bar(aes(x=x,y=y,fill=z),stat="identity")+stat_function(fun=dnorm,args=c(mean=u0,sd=s0),color="red") + theme(legend.position = 'none')+scale_fill_manual(values = c("red","black"))

#7-1-4
library(tidyverse)
prob <- 0.5
n <- 500
u0 <- n*prob
s0 <- sqrt(n*prob*(1-prob))
qnorm(0.95,mean=u0,sd=s0)

## [1] 268.39

i3 <- tibble(x=200:300,y=dbinom(200:300,n,prob),z=c(rep("Y",69),rep("N",32)))
ggplot(i3)+geom_bar(aes(x=x,y=y,fill=z),stat="identity")+stat_function(fun=dnorm,args=c(mean=u0,sd=s0),color="red") + theme(legend.position = 'none')+scale_fill_manual(values = c("red","black"))

# 7-2-1
# スライドの演習
x <- c(97.7,94.7,91.6,98.0,102.9,94.5,99.1,95.3,93.8,102.0)
z1 <- sum(x-100)/(sqrt(10)*5)
t1 <- (mean(x)-100)/(sqrt(var(x))/sqrt(10))
y1 <- dt(t1,9)
p<- 2*pt(t1,9)

# 7-2-2
#library(tidyverse)
#ggplot(data=data.frame(x=c(-5,5)), aes(x=x)) +
#  stat_function(fun=dt, args=c(df=2),linetype="solid")+
#  stat_function(fun=dt, args=c(df=4),linetype="dashed")+
#  stat_function(fun=dt, args=c(df=10),linetype="longdash")+
#  stat_function(fun=dnorm,colour="red")

# 7-3-1
library(tidyverse)
x <- c(97.7,94.7,91.6,98.0,102.9,94.5,99.1,95.3,93.8,102.0)
z1 <- sum(x-100)/(sqrt(10)*5)
t1 <- (mean(x)-100)/(sqrt(var(x))/sqrt(10))
y1 <- dt(t1,9)
ggplot(data=data.frame(x=c(-4,4)), aes(x=x)) +
  stat_function(fun=dt,args=c(df=9))+
  geom_segment(x=t1,y=0,xend=t1,yend=y1,colour="red")+
  geom_segment(x=-t1,y=0,xend=-t1,yend=y1,colour="red")

p<- 2*pt(t1,9)
t.test(x,mu=100,alternative="two.sided")

## 
##  One Sample t-test
## 
## data:  x
## t = -2.638, df = 9, p-value = 0.027
## alternative hypothesis: true mean is not equal to 100
## 95 percent confidence interval:
##  94.35307 99.56693
## sample estimates:
## mean of x 
##     96.96

#t.test(x,mu=100,alternative="greater")
#t.test(x,mu=100,alternative="two.sided")

# 7-3-2
library(tidyverse)
subX <- c(66,73,65,64,59,64,66,63,64,74)
subY <- c(61,64,69,65,70,76,74,82,79,72)
ex1  <- tibble(subX=subX,subY=subY)
ex2 <- ex1 %>% pivot_longer(subX:subY,names_to="sub",values_to="score")
ggplot(ex2)+geom_boxplot(aes(x=sub,y=score))

var.test(ex1$subX,ex1$subY)

## 
##  F test to compare two variances
## 
## data:  ex1$subX and ex1$subY
## F = 0.44824, num df = 9, denom df = 9, p-value = 0.2477
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.111337 1.804620
## sample estimates:
## ratio of variances 
##          0.4482422

t.test(ex1$subX,ex1$subY,paired=FALSE,var.equal = FALSE)

## 
##  Welch Two Sample t-test
## 
## data:  ex1$subX and ex1$subY
## t = -2.1034, df = 15.718, p-value = 0.05191
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -10.85039961   0.05039961
## sample estimates:
## mean of x mean of y 
##      65.8      71.2

t.test(ex1$subX,ex1$subY,paired=FALSE,var.equal = TRUE)

## 
##  Two Sample t-test
## 
## data:  ex1$subX and ex1$subY
## t = -2.1034, df = 18, p-value = 0.04976
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -10.793730633  -0.006269367
## sample estimates:
## mean of x mean of y 
##      65.8      71.2

# 8-1-3 
library( mvtnorm )

## Warning: package 'mvtnorm' was built under R version 4.2.3

n1 <- 14;n2 <- 15;n3 <- 16
#set.seet(3)
#x <- round(rnorm(n1,65,6),0)
#y <- round(rnorm(n2,68,6.5),0)
#z <- round(rnorm(n3,62,5.5),0)
x <- c(58,59,61,64,67,67,67,67,68,68,70,71,72,73)
y <- c(54,56,56,57,58,59,59,60,61,62,63,66,68,70,70)
z <- c(52,53,56,59,60,60,61,62,64,65,65,66,68,70,75,76)
#x <-c(51,56,58 62 62 64 65 66 70 72)
#y <-c(56, 62 64 64 68 72 73 74 74 75)
# 点数と ABCでクラス分け
h1 <- tibble(score=c(x,y,z),class1=c(rep("A",n1),rep("B",n2),rep("C",n3))) 
#中身を確認する場合には以下のコマンドを打つ
#summary(h1)
#h1
# 点数と LMHでクラス分け
# group_by はclass1ごとに分けて平均点を計算
h1 <- h1 %>% mutate(class2 = ifelse(score<68,ifelse(score<61,"L","M"),"H") ) %>% 
    mutate(class2=factor(class2,levels=c("L","M","H"))) %>%
    group_by(class1) %>% mutate(mean=mean(score))
ggplot(h1)+geom_point(aes(x=class1,y=score))+geom_line(aes(x=class1,y=score))+
    geom_point(aes(x=class1,y=mean),colour="red")+
    geom_hline(yintercept=mean(h1$score),colour="blue")

# 検定
h3 <- aov(score~class1,data=h1)
summary(h3)

##             Df Sum Sq Mean Sq F value Pr(>F)  
## class1       2  207.2  103.61   3.196  0.051 .
## Residuals   42 1361.4   32.41                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# クラス2 で平均点を計算し直す
h1 <- h1 %>%  group_by(class2) %>% mutate(mean=mean(score))
ggplot(h1)+geom_point(aes(x=class2,y=score))+geom_line(aes(x=class2,y=score))+
  geom_point(aes(x=class2,y=mean),colour="red")+
  geom_hline(yintercept=mean(h1$score),colour="blue")

# 検定
h4 <- aov(score~class2,data=h1)
summary(h4)

##             Df Sum Sq Mean Sq F value Pr(>F)    
## class2       2 1305.8   652.9   104.4 <2e-16 ***
## Residuals   42  262.8     6.3                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# h3 と比較
oneway.test(score~class1,data=h1,var.equal=TRUE)

## 
##  One-way analysis of means
## 
## data:  score and class1
## F = 3.1965, num df = 2, denom df = 42, p-value = 0.05103

oneway.test(score~class1,data=h1)

## 
##  One-way analysis of means (not assuming equal variances)
## 
## data:  score and class1
## F = 4.2777, num df = 2.000, denom df = 27.848, p-value = 0.02399

# 追加(h4と比較する場合）
#oneway.test(score~class2,data=h1,var.equal=TRUE)
#oneway.test(score~class2,data=h1)

# 8-2-2 のスライドのグラフ
library(tidyverse)
n <- 30
x1 <- qchisq(0.95,n)
y1 <- dchisq(x1,n)
ggplot(data=data.frame(x=c(-1,60)), aes(x=x)) +
  stat_function(fun=dchisq, args=c(df=n))+
  geom_segment(x=x1,y=0,xend=x1,yend=y1)

# 8-2-3 Part1
p1 <- c(0.16,0.18,0.2,0.22,0.24)
p2 <- c(0.2,0.2,0.2,0.2,0.2)
n <- 100
i1 <-  rmultinom(1,n,p1)
chisq.test(i1,p=p2)

## 
##  Chi-squared test for given probabilities
## 
## data:  i1
## X-squared = 4.7, df = 4, p-value = 0.3195

# 8-2-3 Part2
library(tidyverse)
b01 <- read_csv("A02L01.csv")

## Rows: 200 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): study, result, gender
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

b02 <- table( b01$study, b01$result) 
summary(b02)

## Number of cases in table: 200 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 4.444, df = 1, p-value = 0.03501

chisq.test(b02)

## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  b02
## X-squared = 3.7618, df = 1, p-value = 0.05244

chisq.test(b02,correct=FALSE)

## 
##  Pearson's Chi-squared test
## 
## data:  b02
## X-squared = 4.4444, df = 1, p-value = 0.03501

# イェーツの補正をするかしないかを明示する
#chisq.test(b02,correct=TRUE)

# 8-3-2
library(tidyverse)
n <- 5;alpha <- 0.05
# 5個のサンプルデータ
x <- c(1.275,0.592,1.714,-1.888,1.641)
y <- c(-0.112,-0.212,0.575,0.088,0.890)
# corで相関を計算
r <- cor(x,y)
t <- r * sqrt( n-2 )/sqrt( 1- r^2 )
z <- 1/2 * log( (1 + r )/ ( 1 - r ))
zL <- z - qnorm(1- alpha/2,0,1) /sqrt(n-3)
zU <- z + qnorm(1- alpha/2,0,1) /sqrt(n-3)
rhoL <- ( exp(2*zL)-1 )/( exp(2*zL)+1 )
rhoU <- ( exp(2*zU)-1 )/( exp(2*zU)+1 )
cor.test(x,y)

## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 0.80951, df = 3, p-value = 0.4775
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.7324847  0.9505777
## sample estimates:
##       cor 
## 0.4234072

# cor.test の値と下の値を比較する
#c(rhoL,rhoU)
#t

# 8-3-2
# 追加　多項分布で乱数を発生させたもの
library(tidyverse)
rho <- 0.3
n <-10;   alpha <- 0.05
j0 <- matrix(c(1,rho,rho,1),nrow=2)
j1<- rmvnorm(n, mean = c(0, 0), sigma=j0)
r <- cor(j1[,1],j1[,2])
t <- r*sqrt(n-2)/sqrt(1-r^2)
z <- 1/2 * log((1+r)/(1-r))
zL <- z - qnorm(1-alpha/2,0,1) /sqrt(n-3)
zU <- z + qnorm(1- alpha/2,0,1) /sqrt(n-3)
rhoL <- ( exp(2*zL)-1 )/( exp(2*zL)+1 )
rhoU <- ( exp(2*zU)-1 )/( exp(2*zU)+1 )#E2-1-3
library(tidyverse)
x21 <- read_csv("E02P01.csv")

## Rows: 200 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): study, result, gender
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

x22 <- table(x21$study,x21$result)
x23 <- prop.table(x22)
x24 <- addmargins(x22)
x24

##                 
##                  合格 不合格 Sum
##   勉強した         50    100 150
##   勉強しなかった   25     25  50
##   Sum              75    125 200

#E2-1-4
library(tidyverse)
x21 <- read_csv("E02P01.csv")

## Rows: 200 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): study, result, gender
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

x22 <- table(x21$study,x21$result)
x23 <- prop.table(x22)
x24 <- addmargins(x23)
x24

##                 
##                   合格 不合格   Sum
##   勉強した       0.250  0.500 0.750
##   勉強しなかった 0.125  0.125 0.250
##   Sum            0.375  0.625 1.000

#2.2.1
library(tidyverse)
x29 <- read_csv("E02P02.csv",col_types="fddd")
ggplot(x29,aes(x="",y=人数,fill=世代))+geom_bar(stat="identity")+
geom_text(aes(y=座標,label=割合))+coord_polar("y",direction=-1)+theme_void()

#2.2.2
library(tidyverse)
q02 <- read_csv("E02P03.csv",col_types="fdd")

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

names(q02)

## [1] "Media" "Day"   "Time"

ggplot(q02,aes(x=q02$Media,y=q02$Time,fill=q02$Day))+geom_bar(stat="identity")+geom_text(aes(label=q02$Time),vjust=5,position="stack")

#は2.2.3
library(tidyverse)
q03 <- read_csv("E02P04.csv",col_types="dd")
ggplot(q03,aes(x=q03$Year,y=count))+geom_point()+geom_line()+ggtitle("高齢者の交通事故死亡者数の推移")

#2.2.4
library(tidyverse)
q04 <- read_csv("E02P05.csv")

## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Age, Height, position
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q05 <- read_csv("E02P05.csv",col_types="fdd")
ggplot(q04)+geom_point(aes(x=q04$Age,y=q04$Height))+geom_line(aes(x=q04$Age,y=q04$Height))

ggplot(q05)+geom_bar(aes(x=q05$Age,y=q05$Height),stat="identity")

ggplot(q05,aes(x="",y=q05$Height,fill=q05$Age))+geom_bar(,stat="identity",position="fill")+geom_text(aes(label=q05$Height,y=position),vjust=1)

ggplot(q05,aes(x="",y=q05$Height,fill=q05$Age))+geom_bar(,stat="identity",position="fill")+geom_text(aes(label=q05$Height,y=position),vjust=1)+coord_polar("y",direction=-1)+theme_void()

#2.2.5
library(tidyverse)
q06 <- read_csv("E02P06.csv")

## Rows: 100 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): subjA, subjB
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

ggplot(q06)+geom_bar(aes(x=subjA,y=subjB),stat="identity")

ggplot(q06,aes(x=subjA,y=..density..,))+geom_histogram(bins=20)+geom_density()

ggplot(q06)+geom_point(aes(x=subjA,y=subjB))

ggplot(q06)+geom_line(aes(x=subjA,y=subjB))

#3.1.2
library(tidyverse)
q07 <- read_csv("E03P01.csv")

## Rows: 17 Columns: 1
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (1): score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q08 <- read_csv("E03P02.csv")

## Rows: 8 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): 階級, 階級値
## dbl (1): 度数
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q08

## # A tibble: 8 × 3
##   階級             階級値  度数
##   <chr>            <chr>  <dbl>
## 1 10点以上20点未満 15点       1
## 2 20点以上30点未満 25点       1
## 3 30点以上40点未満 35点       2
## 4 40点以上50点未満 45点       2
## 5 50点以上60点未満 55点       5
## 6 60点以上70点未満 65点       2
## 7 70点以上80点未満 75点       2
## 8 80点以上90点未満 85点       1

#3.1.3
library(tidyverse)
q09 <- read_csv("E03P03.csv")

## Rows: 64 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): sub
## dbl (2): id, score
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

q10 <- read_csv("E03P04.csv")

## Rows: 6 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): 点数
## dbl (1): 度数
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

ggplot(q10,aes(x=点数,y=度数))+geom_bar(stat="identity",width=1)

ggplot(q09)+stat_boxplot(aes(x=sub,y=score),coef=Inf)

#4.3.3
library(tidyverse)
rate <- 2
x1 <- 1
y1 <- dexp(x1,rate)
ggplot(data=data.frame(x=c(0,4)), aes(x=x)) +
  stat_function(fun=dexp, args=c(rate=rate) )+geom_area(stat = "function", fun = dexp, args=c(rate=rate),fill = "red", xlim = c(1, 6))+geom_segment(x=0,y=0,xend=0,yend=2.5,colour="black")+geom_segment(x=0,y=0,xend=8,yend=0,colour="black")

#4.3.4
library(tidyverse)
rate <- 2
shape <- 3
x1 <- 1
y1 <- dgamma(x1,shape=shape,rate=rate)
ggplot(data=data.frame(x=c(0,5)), aes(x=x)) +
  stat_function(fun=dgamma, args=c(shape=shape,rate=rate) )+
  geom_area(stat = "function", fun = dgamma, args=c(shape=shape,rate=rate),fill = "red", xlim = c(0, x1))+geom_segment(x=0,y=0,xend=0,yend=dgamma(0,shape,rate),colour="black")+geom_segment(x=0,y=0,xend=8,yend=0,colour="black")

#4.3.5
library(tidyverse)
mean <- 50
sd <- 10
x1 <- 40
y1 <- dnorm(x1,mean=mean,sd=sd)
ggplot(data=data.frame(x=c(0,100)), aes(x=x)) +
  stat_function(fun=dnorm, args=c(mean=mean,sd=sd) ) +
  geom_area(stat = "function", fun = dnorm, args=c(mean=mean,sd=sd),fill = "red", xlim = c(0, x1))

#5.2.1
library(tidyverse)
ggplot(data=data.frame(x=c(0,5)), aes(x=x)) +
  stat_function(fun=dexp, args=c(rate=1) )

ggplot(data=data.frame(x=c(-4,4)), aes(x=x)) +
  stat_function(fun=dt, args=c(df=10) )

ggplot(data=data.frame(x=c(0,6)), aes(x=x)) +
  stat_function(fun=df, args=c(df1=3,df2=10) )

ggplot(data=data.frame(x=c(-0.5,1.5)), aes(x=x)) +
  stat_function(fun=dunif, args=c(min=0,max=1) )

#6.3.1
library(tidyverse)
prob <- 0.3
n <- 1001
m <- 100 
alpha<- 0.05
est <- numeric(m)

for(j in 1:m) {
  est[j]  <- sum(rbinom(n,1,prob)/n)
} 
z <- tibble(count=1:m,est=est,
            lower=est-qnorm(1-alpha/2,0,1)*sqrt(est*(1-est)/n),
            upper=est+qnorm(1-alpha/2,0,1)*sqrt(est*(1-est)/n),
            check= ifelse( abs(prob-est) < qnorm(1-alpha/2,0,1)*
                             sqrt(est*(1-est)/n),"Y","N")  )
ggplot(z)+geom_point(aes(x=count,y=est))+
  geom_segment(aes(x=count,y=lower,xend=count,yend=upper,colour=check))+
  geom_hline(yintercept=prob,colour="red")