#8장 통계분석
#1 난수 생성 및 분포 함수
rnorm(100,0,10)
## [1] -14.77601251 -3.81232647 -10.64954083 22.69045260 14.81752177
## [6] 0.18367636 -21.73424168 -9.57760442 -5.66393672 11.45177063
## [11] -6.85524842 -6.01896571 13.68687219 -3.63735587 -12.18367351
## [16] 1.75847004 -0.83498749 5.24760387 5.27761896 -1.06869223
## [21] 1.37984773 3.84148292 16.17173427 -9.04687993 -1.40860357
## [26] -2.67249784 0.35162013 -25.67950919 -3.99978659 7.93182019
## [31] -12.93346663 -9.44829627 13.12031962 3.60974175 -2.29331209
## [36] -6.13489311 1.73990610 -0.55654598 -6.49441722 15.99433108
## [41] -5.07756863 -12.50476216 14.35740845 -0.06898709 11.12497609
## [46] 8.30986873 4.74608470 -0.26655509 -3.40866353 -19.00131777
## [51] -0.84958467 -8.83739469 -5.17376428 13.86160243 6.08941114
## [56] 5.28230859 -3.40091871 11.63457618 18.29583016 10.16829252
## [61] -7.68475128 3.46872608 8.60293232 2.79425238 6.37990114
## [66] -4.41044039 -23.03100006 11.99929898 2.80224819 5.22699308
## [71] 7.12826396 -6.55660297 8.49845889 3.48478815 6.75216650
## [76] 16.77418732 12.32382831 -1.41328448 -9.19832419 -6.54922362
## [81] 11.39656059 13.91009065 -4.54452643 -9.07601779 16.86884152
## [86] 0.89504694 2.70063886 -8.67978308 -3.04926376 14.61802144
## [91] 19.01739697 -5.93012543 10.21041331 -3.91348056 -1.27952300
## [96] 1.57088726 18.43369060 8.48303456 -3.34535979 -9.52267481
plot(density(rnorm(1000000,0,10)))
dpois(3,1)
## [1] 0.06131324
(1^3*exp(-1)) / (factorial(3))
## [1] 0.06131324
pnorm(0)
## [1] 0.5
qnorm(0.5)
## [1] 0
#2 기초통계량
#2.1 평균,표본 분산, 표본 표준편차
mean(1:5)
## [1] 3
var(1:5)
## [1] 2.5
sum((1:5-mean(1:5))^2)/(5-1)
## [1] 2.5
#2.2 다섯 수치 요약
fivenum(1:11)
## [1] 1.0 3.5 6.0 8.5 11.0
summary(1:11)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.0 3.5 6.0 6.0 8.5 11.0
x <- 1:10
c(min(x),quantile(x,1/4),median(x),quantile(x,3/4),max(x))
## 25% 75%
## 1.00 3.25 5.50 7.75 10.00
IQR(1:10)
## [1] 4.5
quantile(1:10,c(1/4,3/4))
## 25% 75%
## 3.25 7.75
7.75 - 3.25
## [1] 4.5
#2.3 최빈값
x <- factor(c("a","b","c","c","c","d","d"))
x
## [1] a b c c c d d
## Levels: a b c d
table(x)
## x
## a b c d
## 1 1 3 2
which.max(table(x))
## c
## 3
names(table(x))[3]
## [1] "c"
#3
#3.1 단순 임의 추출
sample(1:10,5)
## [1] 6 7 9 5 10
sample(1:10,replace=TRUE)
## [1] 1 9 6 1 2 7 2 10 1 8
sample(1:10,5,replace=TRUE,prob=1:10)
## [1] 8 8 10 4 8
sample(1:10)
## [1] 7 8 10 4 6 1 9 5 3 2
#3.2 층화 임의 추출
library(sampling)
## Warning: package 'sampling' was built under R version 3.3.3
x <- strata(c("Species"),size=c(3,3,3),method="srswor",data=iris)
x
## Species ID_unit Prob Stratum
## 1 setosa 1 0.06 1
## 30 setosa 30 0.06 1
## 42 setosa 42 0.06 1
## 53 versicolor 53 0.06 2
## 65 versicolor 65 0.06 2
## 81 versicolor 81 0.06 2
## 109 virginica 109 0.06 3
## 110 virginica 110 0.06 3
## 126 virginica 126 0.06 3
getdata(iris,x)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species ID_unit
## 1 5.1 3.5 1.4 0.2 setosa 1
## 30 4.7 3.2 1.6 0.2 setosa 30
## 42 4.5 2.3 1.3 0.3 setosa 42
## 53 6.9 3.1 4.9 1.5 versicolor 53
## 65 5.6 2.9 3.6 1.3 versicolor 65
## 81 5.5 2.4 3.8 1.1 versicolor 81
## 109 6.7 2.5 5.8 1.8 virginica 109
## 110 7.2 3.6 6.1 2.5 virginica 110
## 126 7.2 3.2 6.0 1.8 virginica 126
## Prob Stratum
## 1 0.06 1
## 30 0.06 1
## 42 0.06 1
## 53 0.06 2
## 65 0.06 2
## 81 0.06 2
## 109 0.06 3
## 110 0.06 3
## 126 0.06 3
strata(c("Species"),size=c(3,1,1),method="srswr",data=iris)
## Species ID_unit Prob Stratum
## 4 setosa 4 0.058808 1
## 5 setosa 5 0.058808 1
## 12 setosa 12 0.058808 1
## 61 versicolor 61 0.020000 2
## 150 virginica 150 0.020000 3
iris$Species2 <- rep(1:2,75)
strata(c("Species","Species2"),size=c(1,1,1,1,1,1),method="srswr",data=iris)
## Species Species2 ID_unit Prob Stratum
## 27 setosa 1 27 0.04 1
## 4 setosa 2 4 0.04 2
## 75 versicolor 1 75 0.04 3
## 76 versicolor 2 76 0.04 4
## 123 virginica 1 123 0.04 5
## 102 virginica 2 102 0.04 6
library(doBy)
## Warning: package 'doBy' was built under R version 3.3.3

#sampleBy(~Species, frac=.06, data=iris)
#3.3 계통 추출
x <- data.frame(x=1:10)
#sampleBy(~1, frac=.3,data=x,systematic=TRUE)
#4
#4.1 분할표의 작성
table(c("a","b","b","b","c","c","d"))
##
## a b c d
## 1 3 2 1
d <- data.frame(x=c("1","2","2","1"),
y=c("A","B","A","B"),
num=c(3,5,8,7))
d
## x y num
## 1 1 A 3
## 2 2 B 5
## 3 2 A 8
## 4 1 B 7
xt <- xtabs(num~x+y,data=d)
xt
## y
## x A B
## 1 3 7
## 2 8 5
d2 <- data.frame(x=c("A","A","A","B","B"),result=c(3,2,4,7,6))
xtabs(~x,d2)
## x
## A B
## 3 2
xt
## y
## x A B
## 1 3 7
## 2 8 5
margin.table(xt,1)
## x
## 1 2
## 10 13
margin.table(xt,2)
## y
## A B
## 11 12
margin.table(xt)
## [1] 23
xt
## y
## x A B
## 1 3 7
## 2 8 5
prop.table(xt,1)
## y
## x A B
## 1 0.3000000 0.7000000
## 2 0.6153846 0.3846154
prop.table(xt,2)
## y
## x A B
## 1 0.2727273 0.5833333
## 2 0.7272727 0.4166667
prop.table(xt)
## y
## x A B
## 1 0.1304348 0.3043478
## 2 0.3478261 0.2173913
#4.2독립성 검정
library(MASS)
## Warning: package 'MASS' was built under R version 3.3.3
data(survey)
str(survey)
## 'data.frame': 237 obs. of 12 variables:
## $ Sex : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 1 2 1 2 2 ...
## $ Wr.Hnd: num 18.5 19.5 18 18.8 20 18 17.7 17 20 18.5 ...
## $ NW.Hnd: num 18 20.5 13.3 18.9 20 17.7 17.7 17.3 19.5 18.5 ...
## $ W.Hnd : Factor w/ 2 levels "Left","Right": 2 1 2 2 2 2 2 2 2 2 ...
## $ Fold : Factor w/ 3 levels "L on R","Neither",..: 3 3 1 3 2 1 1 3 3 3 ...
## $ Pulse : int 92 104 87 NA 35 64 83 74 72 90 ...
## $ Clap : Factor w/ 3 levels "Left","Neither",..: 1 1 2 2 3 3 3 3 3 3 ...
## $ Exer : Factor w/ 3 levels "Freq","None",..: 3 2 2 2 3 3 1 1 3 3 ...
## $ Smoke : Factor w/ 4 levels "Heavy","Never",..: 2 4 3 2 2 2 2 2 2 2 ...
## $ Height: num 173 178 NA 160 165 ...
## $ M.I : Factor w/ 2 levels "Imperial","Metric": 2 1 NA 2 2 1 1 2 2 2 ...
## $ Age : num 18.2 17.6 16.9 20.3 23.7 ...
xtabs(~Sex + Exer, data=survey)
## Exer
## Sex Freq None Some
## Female 49 11 58
## Male 65 13 40
chisq.test(xtabs(~Sex+Exer, data=survey))
##
## Pearson's Chi-squared test
##
## data: xtabs(~Sex + Exer, data = survey)
## X-squared = 5.7184, df = 2, p-value = 0.05731
#4.3피셔의 정확 검정
xtabs(~W.Hnd+Clap, data=survey)
## Clap
## W.Hnd Left Neither Right
## Left 9 5 4
## Right 29 45 143
chisq.test(xtabs(~W.Hnd+Clap, data=survey))
## Warning in chisq.test(xtabs(~W.Hnd + Clap, data = survey)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: xtabs(~W.Hnd + Clap, data = survey)
## X-squared = 19.252, df = 2, p-value = 6.598e-05
fisher.test(xtabs(~W.Hnd+Clap, data=survey))
##
## Fisher's Exact Test for Count Data
##
## data: xtabs(~W.Hnd + Clap, data = survey)
## p-value = 0.0001413
## alternative hypothesis: two.sided
#4.4맥니마 검정
Performance <- matrix(c(794,86,150,570),nrow=2,dimnames=list("1st Survey"=c("Approve","Disapprove")))
Performance
##
## 1st Survey [,1] [,2]
## Approve 794 150
## Disapprove 86 570
mcnemar.test(Performance)
##
## McNemar's Chi-squared test with continuity correction
##
## data: Performance
## McNemar's chi-squared = 16.818, df = 1, p-value = 4.115e-05
binom.test(86,86+150,.5)
##
## Exact binomial test
##
## data: 86 and 86 + 150
## number of successes = 86, number of trials = 236, p-value =
## 3.716e-05
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.3029404 0.4293268
## sample estimates:
## probability of success
## 0.3644068
#5
#5.1 Chi Square Test
table(survey$W.Hnd)
##
## Left Right
## 18 218
chisq.test(table(survey$W.Hnd),p=c(.3,.7))
##
## Chi-squared test for given probabilities
##
## data: table(survey$W.Hnd)
## X-squared = 56.252, df = 1, p-value = 6.376e-14
#5.2 Shapiro-Wilk Test
shapiro.test(rnorm(1000))
##
## Shapiro-Wilk normality test
##
## data: rnorm(1000)
## W = 0.99948, p-value = 0.9972
#5.3 Kolmogorov-Smirnov Test
ks.test(rnorm(100),rnorm(100))
##
## Two-sample Kolmogorov-Smirnov test
##
## data: rnorm(100) and rnorm(100)
## D = 0.13, p-value = 0.3667
## alternative hypothesis: two-sided
ks.test(rnorm(100),runif(100))
##
## Two-sample Kolmogorov-Smirnov test
##
## data: rnorm(100) and runif(100)
## D = 0.55, p-value = 1.458e-13
## alternative hypothesis: two-sided
ks.test(rnorm(1000),"pnorm",0,1)
##
## One-sample Kolmogorov-Smirnov test
##
## data: rnorm(1000)
## D = 0.034287, p-value = 0.1903
## alternative hypothesis: two-sided
#5.4 Q-Q Plot
x <- rnorm(1000,mean=10,sd=1)
qqnorm(x)
qqline(x,lty=2)

x <- rcauchy(1000)
qqnorm(x)
qqline(x,lty=2)

#6
#6.1 피어슨 상관계수
cor(iris$Sepal.Width, iris$Sepal.Length)
## [1] -0.1175698
cor(iris[,1:4])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length 1.0000000 -0.1175698 0.8717538 0.8179411
## Sepal.Width -0.1175698 1.0000000 -0.4284401 -0.3661259
## Petal.Length 0.8717538 -0.4284401 1.0000000 0.9628654
## Petal.Width 0.8179411 -0.3661259 0.9628654 1.0000000
symnum(cor(iris[,1:4]))
## S.L S.W P.L P.W
## Sepal.Length 1
## Sepal.Width 1
## Petal.Length + . 1
## Petal.Width + . B 1
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.3.3
corrgram(cor(iris[,1:4]),type="corr",upper.panel=panel.conf)

cor(1:10,1:10)
## [1] 1
cor(1:10,1:10*2)
## [1] 1
#6.2 스피어만 상관계수
x <- c(3,4,5,3,2,1,7,5)
rank(sort(x))
## [1] 1.0 2.0 3.5 3.5 5.0 6.5 6.5 8.0
m <- matrix(c(1:10, (1:10)^2),ncol=2)
m
## [,1] [,2]
## [1,] 1 1
## [2,] 2 4
## [3,] 3 9
## [4,] 4 16
## [5,] 5 25
## [6,] 6 36
## [7,] 7 49
## [8,] 8 64
## [9,] 9 81
## [10,] 10 100
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.3.3
##
## Attaching package: 'survival'
## The following objects are masked from 'package:sampling':
##
## cluster, strata
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
rcorr(m,type="pearson")$r
## [,1] [,2]
## [1,] 1.0000000 0.9745586
## [2,] 0.9745586 1.0000000
rcorr(m,type="spearman")$r
## [,1] [,2]
## [1,] 1 1
## [2,] 1 1
x <- -5:5
y <- (-5:5)^2
x
## [1] -5 -4 -3 -2 -1 0 1 2 3 4 5
y
## [1] 25 16 9 4 1 0 1 4 9 16 25
#6.3 켄달의 순위 상관 계수
library(Kendall)
## Warning: package 'Kendall' was built under R version 3.3.3
Kendall(c(1,2,3,4,5),c(1,0,3,4,5))
## tau = 0.8, 2-sided pvalue =0.086411
#6.4 상관 계수 검정
cor.test(c(1,2,3,4,5),c(1,0,3,4,5),method="pearson")
##
## Pearson's product-moment correlation
##
## data: c(1, 2, 3, 4, 5) and c(1, 0, 3, 4, 5)
## t = 3.9279, df = 3, p-value = 0.02937
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1697938 0.9944622
## sample estimates:
## cor
## 0.9149914
cor.test(c(1,2,3,4,5),c(1,0,3,4,5),method="spearman")
##
## Spearman's rank correlation rho
##
## data: c(1, 2, 3, 4, 5) and c(1, 0, 3, 4, 5)
## S = 2, p-value = 0.08333
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9
cor.test(c(1,2,3,4,5),c(1,0,3,4,5),method="kendall")
##
## Kendall's rank correlation tau
##
## data: c(1, 2, 3, 4, 5) and c(1, 0, 3, 4, 5)
## T = 9, p-value = 0.08333
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau
## 0.8
#7
#7.1 일표본 평균
x <- rnorm(30)
x
## [1] -1.11546815 0.52928172 1.42080777 -0.69758453 -0.15735960
## [6] 0.36833558 1.02965890 -1.19108768 -2.32722901 -2.93063986
## [11] 0.14736771 -1.59205023 0.86349539 0.01174044 -1.43826986
## [16] 0.07507787 0.50431428 0.31175476 -0.73587676 0.36086411
## [21] 0.74736293 0.59234875 0.25977449 -0.60978880 -1.02045240
## [26] 1.67095785 0.72962252 0.37619357 0.26653630 -0.88250119
x <- rnorm(30, mean=10)
t.test(x,mu=10)
##
## One Sample t-test
##
## data: x
## t = 0.86181, df = 29, p-value = 0.3959
## alternative hypothesis: true mean is not equal to 10
## 95 percent confidence interval:
## 9.79899 10.49378
## sample estimates:
## mean of x
## 10.14638
#7.2 독립 이표본 평균
sleep
## extra group ID
## 1 0.7 1 1
## 2 -1.6 1 2
## 3 -0.2 1 3
## 4 -1.2 1 4
## 5 -0.1 1 5
## 6 3.4 1 6
## 7 3.7 1 7
## 8 0.8 1 8
## 9 0.0 1 9
## 10 2.0 1 10
## 11 1.9 2 1
## 12 0.8 2 2
## 13 1.1 2 3
## 14 0.1 2 4
## 15 -0.1 2 5
## 16 4.4 2 6
## 17 5.5 2 7
## 18 1.6 2 8
## 19 4.6 2 9
## 20 3.4 2 10
sleep2 <- sleep[,-3]
tapply(sleep2$extra, sleep2$group, mean)
## 1 2
## 0.75 2.33
library(doBy)
summaryBy(extra~group, sleep2)
## group extra.mean
## 1 1 0.75
## 2 2 2.33
var.test(extra~group, sleep2)
##
## F test to compare two variances
##
## data: extra by group
## F = 0.79834, num df = 9, denom df = 9, p-value = 0.7427
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.198297 3.214123
## sample estimates:
## ratio of variances
## 0.7983426
#7.3 짝지은 이표본 평균
sleep
## extra group ID
## 1 0.7 1 1
## 2 -1.6 1 2
## 3 -0.2 1 3
## 4 -1.2 1 4
## 5 -0.1 1 5
## 6 3.4 1 6
## 7 3.7 1 7
## 8 0.8 1 8
## 9 0.0 1 9
## 10 2.0 1 10
## 11 1.9 2 1
## 12 0.8 2 2
## 13 1.1 2 3
## 14 0.1 2 4
## 15 -0.1 2 5
## 16 4.4 2 6
## 17 5.5 2 7
## 18 1.6 2 8
## 19 4.6 2 9
## 20 3.4 2 10
with(sleep, t.test(extra[group==1],extra[group==2],paired=TRUE))
##
## Paired t-test
##
## data: extra[group == 1] and extra[group == 2]
## t = -4.0621, df = 9, p-value = 0.002833
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -2.4598858 -0.7001142
## sample estimates:
## mean of the differences
## -1.58
#7.4 이표본 분산
with(iris, var.test(Sepal.Width, Sepal.Length))
##
## F test to compare two variances
##
## data: Sepal.Width and Sepal.Length
## F = 0.27706, num df = 149, denom df = 149, p-value = 3.595e-14
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.2007129 0.3824528
## sample estimates:
## ratio of variances
## 0.2770617
#7.5 일표본 비율
prop.test(42,100)
##
## 1-sample proportions test with continuity correction
##
## data: 42 out of 100, null probability 0.5
## X-squared = 2.25, df = 1, p-value = 0.1336
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.3233236 0.5228954
## sample estimates:
## p
## 0.42
binom.test(42,100)
##
## Exact binomial test
##
## data: 42 and 100
## number of successes = 42, number of trials = 100, p-value = 0.1332
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
## 0.3219855 0.5228808
## sample estimates:
## probability of success
## 0.42
#7.6 이표본 비율
prop.test(c(45,55), c(100,90))
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: c(45, 55) out of c(100, 90)
## X-squared = 4.3067, df = 1, p-value = 0.03796
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.31185005 -0.01037217
## sample estimates:
## prop 1 prop 2
## 0.4500000 0.6111111