options(encoding = "utf-8")
b01 <- read.csv("A02L01.csv")
b02 <- table(b01$study,b01$result)
b02
##
## 合格 不合格
## 勉強した 100 50
## 勉強しなかった 25 25
第1回 Rの基本操作
## 2-3 データフレーム
h <- c(148,152,154,158)
w <- c(52,54,56,58)
D <- data.frame(h,w)
D
## h w
## 1 148 52
## 2 152 54
## 3 154 56
## 4 158 58
names(D)
## [1] "h" "w"
## 3-1 パッケージの利用(tidyverse)
# install.packages("tidyverse")
# install.packages("knitr")
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## 3-2 tibble (データフレームの改良版:tidyverse)
h <- c(148,152,154,158)
w <- c(52,54,56,58)
d <- tibble(h,w)
d
## # A tibble: 4 × 2
## h w
## <dbl> <dbl>
## 1 148 52
## 2 152 54
## 3 154 56
## 4 158 58
## 3-4 ファイルの読み込み(tidyverse)(1)
library(tidyverse)
a01 <- read_table2("A01L01.txt")
## Warning: `read_table2()` was deprecated in readr 2.0.0.
## ℹ Please use `read_table()` instead.
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## name1 = col_character(),
## A = col_double(),
## B = col_double()
## )
a02 <- read_csv("A01L02.csv")
## Rows: 5 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): name1
## dbl (2): C, D
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
a03 <- read_tsv("A01L03.tsv")
## Rows: 5 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): name1
## dbl (2): height, weight
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## 3-4 ファイルの読み込み(tidyverse)(2)
# ?read.csv
# ?read_csv
第2回 データの要約とレポート作成
## 1-5 表の作成
b01 <- read_csv("A02L01.csv")
## Rows: 200 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): study, result, gender
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
b02 <- table(b01$study,b01$result)
# b02 <- table(b01[,1:2])
prop.table(b02)
##
## 合格 不合格
## 勉強した 0.500 0.250
## 勉強しなかった 0.125 0.125
prop.table(b02,1)
##
## 合格 不合格
## 勉強した 0.6666667 0.3333333
## 勉強しなかった 0.5000000 0.5000000
prop.table(b02,2)
##
## 合格 不合格
## 勉強した 0.8000000 0.6666667
## 勉強しなかった 0.2000000 0.3333333
addmargins(b02)
##
## 合格 不合格 Sum
## 勉強した 100 50 150
## 勉強しなかった 25 25 50
## Sum 125 75 200
addmargins(b02,1)
##
## 合格 不合格
## 勉強した 100 50
## 勉強しなかった 25 25
## Sum 125 75
addmargins(b02,2)
##
## 合格 不合格 Sum
## 勉強した 100 50 150
## 勉強しなかった 25 25 50
table(b01[,1:2]) %>% prop.table() %>% addmargins()
## result
## study 合格 不合格 Sum
## 勉強した 0.500 0.250 0.750
## 勉強しなかった 0.125 0.125 0.250
## Sum 0.625 0.375 1.000
## 2-1 グラフの作成(2)
library(ggplot2)
# ggplot() +
# geom_point()
# geom_line()
# geom_bar()
# geom_histogram()
# geom_box()
# stat_smooth()
# theme()
## 2-2 散布図、折れ線グラフ(1)
a01 <- read_csv("A02L02.csv")
## Rows: 5 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): name1
## dbl (2): A, B
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(a01,aes(x=A,y=B)) +
geom_point()
a01
## # A tibble: 5 × 3
## name1 A B
## <chr> <dbl> <dbl>
## 1 A1 148 52
## 2 A2 152 54
## 3 A3 154 56
## 4 A4 158 58
## 5 A5 163 60
## 2-2 散布図、折れ線グラフ(2)
ggplot(a01,aes(x=A,y=B)) +
geom_line() +
labs(x="hight",y="weight") +
theme(text=element_text(size=16))
## 2-2 散布図、折れ線グラフ(3)
ggplot(a01,aes(x=A,y=B)) +
geom_point() + geom_line() +
theme(text=element_text(size=16))
## 2-3 棒グラフ、積み上げ棒グラフ、円グラフ(1)
c01 <- read_csv("A02L03.csv")
## Rows: 7 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): A
## dbl (2): freq, pos1
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(c01,aes(x=A,y=freq)) +
geom_bar(stat="identity")
ggplot(b01,aes(x=study)) +
geom_bar()
## 2-3 棒グラフ、積み上げ棒グラフ、円グラフ(2)
p1 <- ggplot(c01,aes(x="",y=freq,fill=A)) +
geom_bar(stat="identity",position="fill")
geom_text(aes(label=freq,y=pos1))
## mapping: y = ~pos1, label = ~freq
## geom_text: parse = FALSE, check_overlap = FALSE, na.rm = FALSE
## stat_identity: na.rm = FALSE
## position_identity
p1
c01[,3] <- 1-(cumsum(c01$freq)-(0.5*c01$freq))
## 2-3 棒グラフ、積み上げ棒グラフ、円グラフ(3)
p2 <- p1 +
coord_polar("y",direction=-1) +
theme_void()
p2
## 2-4 ヒストグラム、密度グラフ
c02 <- read_csv("A02L04.csv")
## Rows: 100 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (2): subjA, subjB
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ggplot(c02,aes(x=subjA,y=..density..)) +
geom_histogram() +
geom_density()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
第3回 記述統計と確立
## 1-4 集団の特徴を表現する量(3)
library(ggplot2)
d01 <- read.csv("A03L01.csv")
ggplot(d01) +
geom_boxplot(aes(x=subject,y=score))
## 1-4 集団の特徴を表現する量(6)
d02 <- read.csv("A03L02.csv")
var(d02)
## subjA subjB
## subjA 89.42424 18.18788
## subjB 18.18788 138.19960
cor(d02)
## subjA subjB
## subjA 1.0000000 0.1636067
## subjB 0.1636067 1.0000000
第4回 確率分布①
## 3-3 Rによるシミュリーション
## 離散型
library(tidyverse)
f1 <- tibble(x=0:10,y=dgeom(seq(0,10,1),0.2))
ggplot(f1) + geom_bar(aes(x=x,y=y),stat="identity")
## 連続型
ggplot(data=data.frame(x=c(-1,1,5)),aes(x=x)) +
stat_function(fun=dexp,args=c(rate=1))
ggplot(data.frame(x=rnorm(1000))) +
geom_histogram(aes(x=x,y=..density..)) +
stat_function(fun=dnorm,color="red")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
第5回 確率分布②
## 3-1 Rにおける分布関数と密度関数の描画
ggplot(data=data.frame(x=c(-1,10)),aes(x=x)) +
stat_function(fun=df,args=list(df1=30,df2=20))
## 3-2 大数の弱法則
n <- 50000
p <- 0.3
g1 <- tibble(count=1:n,trial=rbinom(n,1,p),
success=cumsum(trial),prob=success/count)
ggplot(g1,aes(x=count,y=prob))+ylim(0,1)+
geom_hline(yintercept=p, colour="red")+geom_line()
## ③一3中心極限定理
n <- 1000
m <- 10000
a <- -10;b <- 10
g2 <- numeric(m)
for(i in 1:m){
g2[i] <- (mean(runif(n,a,b))-(a+b)/2)/(sqrt((b-a)^2/12)/sqrt(n))
}
g3 <- tibble(x=g2)
ggplot(g3)+geom_histogram(aes(x=x,y=..density..),bins=50)
## ③一χ2分布
n <- 3
m <- 1000
#
g4 <- numeric(m)
for(i in 1:m){
g4[i] <- (sum(rnorm(n,0,1)^2))
}
g5 <- tibble(x=g4)
ggplot(g5)+geom_histogram(aes(x=x,y=..density..),bins=50)+
stat_function(fun=dchisq,args=c(df=n))
## ③一5 t分布
a <- 10;b <- 5;
n <- 20;m <- 1000
g6 <- numeric(n)
g7 <- numeric(m)
for(i in 1:m){
g6 <- rnorm(n,a,b);barx <- mean(g6)
g7[i] <- sum((g6-a)/(b*sqrt(n)))/sqrt(sum((g6-barx)^2/b^2)/(n-1))
}
g8 <- tibble(x=g7)
ggplot(g8)+geom_histogram(aes(x=x,y=..density..),bins=50)+
stat_function(fun=dt,args=c(df=(n-1)))
## ③一6 F分布
m <- 1000
n1 <- 60;n2 <- 70;
a1 <- 10;b1<-10;a2 <- 30;b2 <- 40
ge <- numeric(m)
for (i in 1:m){
ga <- rnorm(n1,a1,b1);gb <- rnorm(n2,a2,b2)
gc <- sum((ga-mean(ga))^2/(b1^2));gd <- sum((gb-mean(gb))^2/(b2^2))
ge[i] <- (gc/(n1-1))/(gd/(n2-1))
}
gf <- tibble(x=ge)
ggplot(gf)+geom_histogram(aes(x=x,y=..density..),bins=50)+
stat_function(fun=df,args=c(df1=(n1-1),df2=(n2-1)))
第6回 統計的推定
## ③-1 区間推定
prob <- 0.3
n <- 1001
m <- 100
alpha <- 0.05
est <- numeric(m)
for(j in 1:m){
est[j] <- sum(rbinom(n,1,prob)/n)
}
z <- tibble(count=1:m,est=est,
lower=est - qnorm(1-alpha/2,0,1)*sqrt(est*(1-est)/n),
upper=est + qnorm(1-alpha/2,0,1)*sqrt(est*(1-est)/n),
check=ifelse(abs(prob-est) < qnorm(1-alpha/2,0,1)*
sqrt(est*(1-est)/n),"Y","N"))
## グラフ
ggplot(z)+geom_point(aes(x=count,y=est))+
geom_segment(aes(x=count,y=lower,xend=count,yend=upper,colour=check))+
geom_hline(yintercept=prob,colour="red")
第7回 統計的仮説検定①
## ①-4 片側検定と両側検定(2)
## 二項分布を正規分布で近似
prob <- 0.5;n <- 500;u0 <- n*prob;s0 <- sqrt(n*prob*(1-prob))
qnorm(0.025,mean=u0,sd=s0) #228.0869
## [1] 228.0869
qnorm(0.05,mean=u0,sd=s0) #231.61
## [1] 231.61
qnorm(0.95,mean=u0,sd=s0) #268.39
## [1] 268.39
qnorm(0.975,mean=u0,sd=s0) #271.9131
## [1] 271.9131
## ①-4 片側検定と両側検定(3)
i3 <- tibble(x=200:300,y=dbinom(200:300,n,prob),z=c(rep("Y",69),rep("N",32)))
ggplot(i3)+geom_bar(aes(x=x,y=y,fill=z),stat="identity")+
stat_function(fun=dnorm,args=c(mean=u0,sd=s0),color="red")+
theme(legend.position="none")+
scale_fill_manual(values=c("red","black"))
group1 <- c(0.7,-1.6,-0.2,-1.2,-0.1,3.4,3.7,0.8,0.0,2.0) # 睡眠時間の増加
t.test(group1, mu=0)
##
## One Sample t-test
##
## data: group1
## t = 1.3257, df = 9, p-value = 0.2176
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.5297804 2.0297804
## sample estimates:
## mean of x
## 0.75
## ③-1 1標本の母平均の検定
x <- c(97.7,94.7,91.6,98.0,102.9,94.5,99.1,95.3,93.8,102.0)
z1 <- sum(x-100)/(sqrt(10)*5)
t1 <- (mean(x)-100)/(sqrt(var(x))/sqrt(10))
y1 <- dt(t1,9)
p <- 2*pt(t1,9)
ggplot(data=data.frame(x=c(-4,4)),aes(x=x))+
stat_function(fun=dt,args=c(df=9))+
geom_segment(x=t1,y=0,xend=t1,yend=y1,colour="red")+
geom_segment(x=-t1,y=0,xend=-t1,yend=y1,colour="red")
t.test(x, mu=100)
##
## One Sample t-test
##
## data: x
## t = -2.638, df = 9, p-value = 0.027
## alternative hypothesis: true mean is not equal to 100
## 95 percent confidence interval:
## 94.35307 99.56693
## sample estimates:
## mean of x
## 96.96
## ③-2 2標本の母平均の差の検定
subX <- c(66,73,65,64,59,64,66,63,64,74)
subY <- c(61,64,69,65,70,76,74,82,79,72)
var.test(subX,subY)
##
## F test to compare two variances
##
## data: subX and subY
## F = 0.44824, num df = 9, denom df = 9, p-value = 0.2477
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.111337 1.804620
## sample estimates:
## ratio of variances
## 0.4482422
t.test(subX,subY,paired=FALSE,var.equal=FALSE)
##
## Welch Two Sample t-test
##
## data: subX and subY
## t = -2.1034, df = 15.718, p-value = 0.05191
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -10.85039961 0.05039961
## sample estimates:
## mean of x mean of y
## 65.8 71.2
t.test(subX,subY,paired=FALSE,var.equal=TRUE)
##
## Two Sample t-test
##
## data: subX and subY
## t = -2.1034, df = 18, p-value = 0.04976
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -10.793730633 -0.006269367
## sample estimates:
## mean of x mean of y
## 65.8 71.2
t.test(subX,subY,paired=TRUE)
##
## Paired t-test
##
## data: subX and subY
## t = -1.8701, df = 9, p-value = 0.09428
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## -11.932026 1.132026
## sample estimates:
## mean difference
## -5.4
第8回 統計的仮説検定②
## ①-3 Rによるシミュレーション(1)
x <- c(58,59,61,64,67,67,67,67,68,68,70,71,72,73)
y <- c(54,56,56,57,58,59,59,60,61,62,63,66,68,70,70)
z <- c(52,53,56,59,60,60,61,62,64,65,65,66,68,70,75,76)
na <- length(x);nb <- length(y);nc <- length(z)
h1 <- tibble(score=c(x,y,z),class1=c(rep("A",na),rep("B",nb),rep("C",nc)))
h1 <- h1 %>% mutate(class2=
ifelse(score<68,ifelse(score<61,"L","M"),"H")) %>%
mutate(class2=factor(class2,levels=c("L","M","H")))
h3 <- aov(score~class1,data=h1); summary(h3)
## Df Sum Sq Mean Sq F value Pr(>F)
## class1 2 207.2 103.61 3.196 0.051 .
## Residuals 42 1361.4 32.41
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
h4 <- aov(score~class2,data=h1); summary(h4)
## Df Sum Sq Mean Sq F value Pr(>F)
## class2 2 1305.8 652.9 104.4 <2e-16 ***
## Residuals 42 262.8 6.3
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
oneway.test(score~class1,data=h1,var.equal=TRUE)
##
## One-way analysis of means
##
## data: score and class1
## F = 3.1965, num df = 2, denom df = 42, p-value = 0.05103
oneway.test(score~class1,data=h1)
##
## One-way analysis of means (not assuming equal variances)
##
## data: score and class1
## F = 4.2777, num df = 2.000, denom df = 27.848, p-value = 0.02399
## ②-3 Rによるシミュレーション
p1 <- c(0.18,0.19,0.2,0.21,0.22)
p2 <- c(0.2,0.2,0.2,0.2,0.2)
n <- 100
i1 <- rmultinom(1,n,p1)
chisq.test(i1,p=p2)
##
## Chi-squared test for given probabilities
##
## data: i1
## X-squared = 0.8, df = 4, p-value = 0.9384
b01 <- read_csv("A02L01.csv")
## Rows: 200 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): study, result, gender
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
b02 <- table(b01$study,b01$result)
summary(b02)
## Number of cases in table: 200
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 4.444, df = 1, p-value = 0.03501
chisq.test(b02)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: b02
## X-squared = 3.7618, df = 1, p-value = 0.05244
chisq.test(b02,correct=FALSE)
##
## Pearson's Chi-squared test
##
## data: b02
## X-squared = 4.4444, df = 1, p-value = 0.03501
## ③-2 相関の検定
n <- 5;alpha <- 0.05
x <- c(1.275,0.592,-1.714,-1.888,1.641)
y <- c(-0.112,-0.212,0.575,0.088,0.890)
r <- cor(x,y)
t <- r*sqrt(n-2)/sqrt(1-r^2)
z <- 1/2*log((1+r)/(1-r))
zL<- z-qnorm(1-alpha/2,0,1)/sqrt(n-3)
zU <- z+qnorm(1-alpha/2,0,1)/sqrt(n-3)
rhoL <- (exp(2*zL)-1)/(exp(2*zL)+1)
rhoU <- (exp(2*zU)-1)/(exp(2*zU)+1)
cor.test(x,y)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 0.035026, df = 3, p-value = 0.9743
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8777047 0.8866683
## sample estimates:
## cor
## 0.02021788