#8장 통계분석
#1 난수 생성 및 분포 함수
rnorm(100,0,10)
##   [1] -14.77601251  -3.81232647 -10.64954083  22.69045260  14.81752177
##   [6]   0.18367636 -21.73424168  -9.57760442  -5.66393672  11.45177063
##  [11]  -6.85524842  -6.01896571  13.68687219  -3.63735587 -12.18367351
##  [16]   1.75847004  -0.83498749   5.24760387   5.27761896  -1.06869223
##  [21]   1.37984773   3.84148292  16.17173427  -9.04687993  -1.40860357
##  [26]  -2.67249784   0.35162013 -25.67950919  -3.99978659   7.93182019
##  [31] -12.93346663  -9.44829627  13.12031962   3.60974175  -2.29331209
##  [36]  -6.13489311   1.73990610  -0.55654598  -6.49441722  15.99433108
##  [41]  -5.07756863 -12.50476216  14.35740845  -0.06898709  11.12497609
##  [46]   8.30986873   4.74608470  -0.26655509  -3.40866353 -19.00131777
##  [51]  -0.84958467  -8.83739469  -5.17376428  13.86160243   6.08941114
##  [56]   5.28230859  -3.40091871  11.63457618  18.29583016  10.16829252
##  [61]  -7.68475128   3.46872608   8.60293232   2.79425238   6.37990114
##  [66]  -4.41044039 -23.03100006  11.99929898   2.80224819   5.22699308
##  [71]   7.12826396  -6.55660297   8.49845889   3.48478815   6.75216650
##  [76]  16.77418732  12.32382831  -1.41328448  -9.19832419  -6.54922362
##  [81]  11.39656059  13.91009065  -4.54452643  -9.07601779  16.86884152
##  [86]   0.89504694   2.70063886  -8.67978308  -3.04926376  14.61802144
##  [91]  19.01739697  -5.93012543  10.21041331  -3.91348056  -1.27952300
##  [96]   1.57088726  18.43369060   8.48303456  -3.34535979  -9.52267481
plot(density(rnorm(1000000,0,10)))

dpois(3,1)
## [1] 0.06131324
(1^3*exp(-1)) / (factorial(3))
## [1] 0.06131324
pnorm(0)
## [1] 0.5
qnorm(0.5)
## [1] 0
#2 기초통계량
#2.1 평균,표본 분산, 표본 표준편차
mean(1:5)
## [1] 3
var(1:5)
## [1] 2.5
sum((1:5-mean(1:5))^2)/(5-1)
## [1] 2.5
#2.2 다섯 수치 요약
fivenum(1:11)
## [1]  1.0  3.5  6.0  8.5 11.0
summary(1:11)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     1.0     3.5     6.0     6.0     8.5    11.0
x <- 1:10
c(min(x),quantile(x,1/4),median(x),quantile(x,3/4),max(x))
##         25%         75%       
##  1.00  3.25  5.50  7.75 10.00
IQR(1:10)
## [1] 4.5
quantile(1:10,c(1/4,3/4))
##  25%  75% 
## 3.25 7.75
7.75 - 3.25
## [1] 4.5
#2.3 최빈값
x <- factor(c("a","b","c","c","c","d","d"))
x
## [1] a b c c c d d
## Levels: a b c d
table(x)
## x
## a b c d 
## 1 1 3 2
which.max(table(x))
## c 
## 3
names(table(x))[3]
## [1] "c"
#3
#3.1 단순 임의 추출
sample(1:10,5)
## [1]  6  7  9  5 10
sample(1:10,replace=TRUE)
##  [1]  1  9  6  1  2  7  2 10  1  8
sample(1:10,5,replace=TRUE,prob=1:10)
## [1]  8  8 10  4  8
sample(1:10)
##  [1]  7  8 10  4  6  1  9  5  3  2
#3.2 층화 임의 추출
library(sampling)
## Warning: package 'sampling' was built under R version 3.3.3
x <- strata(c("Species"),size=c(3,3,3),method="srswor",data=iris)
x
##        Species ID_unit Prob Stratum
## 1       setosa       1 0.06       1
## 30      setosa      30 0.06       1
## 42      setosa      42 0.06       1
## 53  versicolor      53 0.06       2
## 65  versicolor      65 0.06       2
## 81  versicolor      81 0.06       2
## 109  virginica     109 0.06       3
## 110  virginica     110 0.06       3
## 126  virginica     126 0.06       3
getdata(iris,x)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width    Species ID_unit
## 1            5.1         3.5          1.4         0.2     setosa       1
## 30           4.7         3.2          1.6         0.2     setosa      30
## 42           4.5         2.3          1.3         0.3     setosa      42
## 53           6.9         3.1          4.9         1.5 versicolor      53
## 65           5.6         2.9          3.6         1.3 versicolor      65
## 81           5.5         2.4          3.8         1.1 versicolor      81
## 109          6.7         2.5          5.8         1.8  virginica     109
## 110          7.2         3.6          6.1         2.5  virginica     110
## 126          7.2         3.2          6.0         1.8  virginica     126
##     Prob Stratum
## 1   0.06       1
## 30  0.06       1
## 42  0.06       1
## 53  0.06       2
## 65  0.06       2
## 81  0.06       2
## 109 0.06       3
## 110 0.06       3
## 126 0.06       3
strata(c("Species"),size=c(3,1,1),method="srswr",data=iris)
##        Species ID_unit     Prob Stratum
## 4       setosa       4 0.058808       1
## 5       setosa       5 0.058808       1
## 12      setosa      12 0.058808       1
## 61  versicolor      61 0.020000       2
## 150  virginica     150 0.020000       3
iris$Species2 <- rep(1:2,75)
strata(c("Species","Species2"),size=c(1,1,1,1,1,1),method="srswr",data=iris)
##        Species Species2 ID_unit Prob Stratum
## 27      setosa        1      27 0.04       1
## 4       setosa        2       4 0.04       2
## 75  versicolor        1      75 0.04       3
## 76  versicolor        2      76 0.04       4
## 123  virginica        1     123 0.04       5
## 102  virginica        2     102 0.04       6
library(doBy)
## Warning: package 'doBy' was built under R version 3.3.3

#sampleBy(~Species, frac=.06, data=iris)

#3.3 계통 추출
x <- data.frame(x=1:10)
#sampleBy(~1, frac=.3,data=x,systematic=TRUE)

#4
#4.1 분할표의 작성
table(c("a","b","b","b","c","c","d"))
## 
## a b c d 
## 1 3 2 1
d <- data.frame(x=c("1","2","2","1"),
                y=c("A","B","A","B"),
                num=c(3,5,8,7))
d
##   x y num
## 1 1 A   3
## 2 2 B   5
## 3 2 A   8
## 4 1 B   7
xt <- xtabs(num~x+y,data=d)
xt
##    y
## x   A B
##   1 3 7
##   2 8 5
d2 <- data.frame(x=c("A","A","A","B","B"),result=c(3,2,4,7,6))
xtabs(~x,d2)
## x
## A B 
## 3 2
xt
##    y
## x   A B
##   1 3 7
##   2 8 5
margin.table(xt,1)
## x
##  1  2 
## 10 13
margin.table(xt,2)
## y
##  A  B 
## 11 12
margin.table(xt)
## [1] 23
xt
##    y
## x   A B
##   1 3 7
##   2 8 5
prop.table(xt,1)
##    y
## x           A         B
##   1 0.3000000 0.7000000
##   2 0.6153846 0.3846154
prop.table(xt,2)
##    y
## x           A         B
##   1 0.2727273 0.5833333
##   2 0.7272727 0.4166667
prop.table(xt)
##    y
## x           A         B
##   1 0.1304348 0.3043478
##   2 0.3478261 0.2173913
#4.2독립성 검정
library(MASS)
## Warning: package 'MASS' was built under R version 3.3.3
data(survey)
str(survey)
## 'data.frame':    237 obs. of  12 variables:
##  $ Sex   : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 1 2 1 2 2 ...
##  $ Wr.Hnd: num  18.5 19.5 18 18.8 20 18 17.7 17 20 18.5 ...
##  $ NW.Hnd: num  18 20.5 13.3 18.9 20 17.7 17.7 17.3 19.5 18.5 ...
##  $ W.Hnd : Factor w/ 2 levels "Left","Right": 2 1 2 2 2 2 2 2 2 2 ...
##  $ Fold  : Factor w/ 3 levels "L on R","Neither",..: 3 3 1 3 2 1 1 3 3 3 ...
##  $ Pulse : int  92 104 87 NA 35 64 83 74 72 90 ...
##  $ Clap  : Factor w/ 3 levels "Left","Neither",..: 1 1 2 2 3 3 3 3 3 3 ...
##  $ Exer  : Factor w/ 3 levels "Freq","None",..: 3 2 2 2 3 3 1 1 3 3 ...
##  $ Smoke : Factor w/ 4 levels "Heavy","Never",..: 2 4 3 2 2 2 2 2 2 2 ...
##  $ Height: num  173 178 NA 160 165 ...
##  $ M.I   : Factor w/ 2 levels "Imperial","Metric": 2 1 NA 2 2 1 1 2 2 2 ...
##  $ Age   : num  18.2 17.6 16.9 20.3 23.7 ...
xtabs(~Sex + Exer, data=survey)
##         Exer
## Sex      Freq None Some
##   Female   49   11   58
##   Male     65   13   40
chisq.test(xtabs(~Sex+Exer, data=survey))
## 
##  Pearson's Chi-squared test
## 
## data:  xtabs(~Sex + Exer, data = survey)
## X-squared = 5.7184, df = 2, p-value = 0.05731
#4.3피셔의 정확 검정
xtabs(~W.Hnd+Clap, data=survey)
##        Clap
## W.Hnd   Left Neither Right
##   Left     9       5     4
##   Right   29      45   143
chisq.test(xtabs(~W.Hnd+Clap, data=survey))
## Warning in chisq.test(xtabs(~W.Hnd + Clap, data = survey)): Chi-squared
## approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  xtabs(~W.Hnd + Clap, data = survey)
## X-squared = 19.252, df = 2, p-value = 6.598e-05
fisher.test(xtabs(~W.Hnd+Clap, data=survey))
## 
##  Fisher's Exact Test for Count Data
## 
## data:  xtabs(~W.Hnd + Clap, data = survey)
## p-value = 0.0001413
## alternative hypothesis: two.sided
#4.4맥니마 검정
Performance <- matrix(c(794,86,150,570),nrow=2,dimnames=list("1st Survey"=c("Approve","Disapprove")))
Performance
##             
## 1st Survey   [,1] [,2]
##   Approve     794  150
##   Disapprove   86  570
mcnemar.test(Performance)
## 
##  McNemar's Chi-squared test with continuity correction
## 
## data:  Performance
## McNemar's chi-squared = 16.818, df = 1, p-value = 4.115e-05
binom.test(86,86+150,.5)
## 
##  Exact binomial test
## 
## data:  86 and 86 + 150
## number of successes = 86, number of trials = 236, p-value =
## 3.716e-05
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
##  0.3029404 0.4293268
## sample estimates:
## probability of success 
##              0.3644068
#5
#5.1 Chi Square Test
table(survey$W.Hnd)
## 
##  Left Right 
##    18   218
chisq.test(table(survey$W.Hnd),p=c(.3,.7))
## 
##  Chi-squared test for given probabilities
## 
## data:  table(survey$W.Hnd)
## X-squared = 56.252, df = 1, p-value = 6.376e-14
#5.2 Shapiro-Wilk Test
shapiro.test(rnorm(1000))
## 
##  Shapiro-Wilk normality test
## 
## data:  rnorm(1000)
## W = 0.99948, p-value = 0.9972
#5.3 Kolmogorov-Smirnov Test
ks.test(rnorm(100),rnorm(100))
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  rnorm(100) and rnorm(100)
## D = 0.13, p-value = 0.3667
## alternative hypothesis: two-sided
ks.test(rnorm(100),runif(100))
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  rnorm(100) and runif(100)
## D = 0.55, p-value = 1.458e-13
## alternative hypothesis: two-sided
ks.test(rnorm(1000),"pnorm",0,1)
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  rnorm(1000)
## D = 0.034287, p-value = 0.1903
## alternative hypothesis: two-sided
#5.4 Q-Q Plot
x <- rnorm(1000,mean=10,sd=1)
qqnorm(x)
qqline(x,lty=2)

x <- rcauchy(1000)
qqnorm(x)
qqline(x,lty=2)

#6
#6.1 피어슨 상관계수
cor(iris$Sepal.Width, iris$Sepal.Length)
## [1] -0.1175698
cor(iris[,1:4])
##              Sepal.Length Sepal.Width Petal.Length Petal.Width
## Sepal.Length    1.0000000  -0.1175698    0.8717538   0.8179411
## Sepal.Width    -0.1175698   1.0000000   -0.4284401  -0.3661259
## Petal.Length    0.8717538  -0.4284401    1.0000000   0.9628654
## Petal.Width     0.8179411  -0.3661259    0.9628654   1.0000000
symnum(cor(iris[,1:4]))
##              S.L S.W P.L P.W
## Sepal.Length 1              
## Sepal.Width      1          
## Petal.Length +   .   1      
## Petal.Width  +   .   B   1  
## attr(,"legend")
## [1] 0 ' ' 0.3 '.' 0.6 ',' 0.8 '+' 0.9 '*' 0.95 'B' 1
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.3.3
corrgram(cor(iris[,1:4]),type="corr",upper.panel=panel.conf)

cor(1:10,1:10)
## [1] 1
cor(1:10,1:10*2)
## [1] 1
#6.2 스피어만 상관계수
x <- c(3,4,5,3,2,1,7,5)
rank(sort(x))
## [1] 1.0 2.0 3.5 3.5 5.0 6.5 6.5 8.0
m <- matrix(c(1:10, (1:10)^2),ncol=2)
m
##       [,1] [,2]
##  [1,]    1    1
##  [2,]    2    4
##  [3,]    3    9
##  [4,]    4   16
##  [5,]    5   25
##  [6,]    6   36
##  [7,]    7   49
##  [8,]    8   64
##  [9,]    9   81
## [10,]   10  100
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 3.3.3
## Loading required package: lattice
## Loading required package: survival
## Warning: package 'survival' was built under R version 3.3.3
## 
## Attaching package: 'survival'
## The following objects are masked from 'package:sampling':
## 
##     cluster, strata
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.3.3
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
rcorr(m,type="pearson")$r
##           [,1]      [,2]
## [1,] 1.0000000 0.9745586
## [2,] 0.9745586 1.0000000
rcorr(m,type="spearman")$r
##      [,1] [,2]
## [1,]    1    1
## [2,]    1    1
x <- -5:5
y <- (-5:5)^2
x
##  [1] -5 -4 -3 -2 -1  0  1  2  3  4  5
y
##  [1] 25 16  9  4  1  0  1  4  9 16 25
#6.3 켄달의 순위 상관 계수
library(Kendall)
## Warning: package 'Kendall' was built under R version 3.3.3
Kendall(c(1,2,3,4,5),c(1,0,3,4,5))
## tau = 0.8, 2-sided pvalue =0.086411
#6.4 상관 계수 검정
cor.test(c(1,2,3,4,5),c(1,0,3,4,5),method="pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  c(1, 2, 3, 4, 5) and c(1, 0, 3, 4, 5)
## t = 3.9279, df = 3, p-value = 0.02937
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1697938 0.9944622
## sample estimates:
##       cor 
## 0.9149914
cor.test(c(1,2,3,4,5),c(1,0,3,4,5),method="spearman")
## 
##  Spearman's rank correlation rho
## 
## data:  c(1, 2, 3, 4, 5) and c(1, 0, 3, 4, 5)
## S = 2, p-value = 0.08333
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho 
## 0.9
cor.test(c(1,2,3,4,5),c(1,0,3,4,5),method="kendall")
## 
##  Kendall's rank correlation tau
## 
## data:  c(1, 2, 3, 4, 5) and c(1, 0, 3, 4, 5)
## T = 9, p-value = 0.08333
## alternative hypothesis: true tau is not equal to 0
## sample estimates:
## tau 
## 0.8
#7
#7.1 일표본 평균
x <- rnorm(30)
x
##  [1] -1.11546815  0.52928172  1.42080777 -0.69758453 -0.15735960
##  [6]  0.36833558  1.02965890 -1.19108768 -2.32722901 -2.93063986
## [11]  0.14736771 -1.59205023  0.86349539  0.01174044 -1.43826986
## [16]  0.07507787  0.50431428  0.31175476 -0.73587676  0.36086411
## [21]  0.74736293  0.59234875  0.25977449 -0.60978880 -1.02045240
## [26]  1.67095785  0.72962252  0.37619357  0.26653630 -0.88250119
x <- rnorm(30, mean=10)
t.test(x,mu=10)
## 
##  One Sample t-test
## 
## data:  x
## t = 0.86181, df = 29, p-value = 0.3959
## alternative hypothesis: true mean is not equal to 10
## 95 percent confidence interval:
##   9.79899 10.49378
## sample estimates:
## mean of x 
##  10.14638
#7.2 독립 이표본 평균
sleep
##    extra group ID
## 1    0.7     1  1
## 2   -1.6     1  2
## 3   -0.2     1  3
## 4   -1.2     1  4
## 5   -0.1     1  5
## 6    3.4     1  6
## 7    3.7     1  7
## 8    0.8     1  8
## 9    0.0     1  9
## 10   2.0     1 10
## 11   1.9     2  1
## 12   0.8     2  2
## 13   1.1     2  3
## 14   0.1     2  4
## 15  -0.1     2  5
## 16   4.4     2  6
## 17   5.5     2  7
## 18   1.6     2  8
## 19   4.6     2  9
## 20   3.4     2 10
sleep2 <- sleep[,-3]

tapply(sleep2$extra, sleep2$group, mean)
##    1    2 
## 0.75 2.33
library(doBy)
summaryBy(extra~group, sleep2)
##   group extra.mean
## 1     1       0.75
## 2     2       2.33
var.test(extra~group, sleep2)
## 
##  F test to compare two variances
## 
## data:  extra by group
## F = 0.79834, num df = 9, denom df = 9, p-value = 0.7427
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.198297 3.214123
## sample estimates:
## ratio of variances 
##          0.7983426
#7.3 짝지은 이표본 평균
sleep
##    extra group ID
## 1    0.7     1  1
## 2   -1.6     1  2
## 3   -0.2     1  3
## 4   -1.2     1  4
## 5   -0.1     1  5
## 6    3.4     1  6
## 7    3.7     1  7
## 8    0.8     1  8
## 9    0.0     1  9
## 10   2.0     1 10
## 11   1.9     2  1
## 12   0.8     2  2
## 13   1.1     2  3
## 14   0.1     2  4
## 15  -0.1     2  5
## 16   4.4     2  6
## 17   5.5     2  7
## 18   1.6     2  8
## 19   4.6     2  9
## 20   3.4     2 10
with(sleep, t.test(extra[group==1],extra[group==2],paired=TRUE))
## 
##  Paired t-test
## 
## data:  extra[group == 1] and extra[group == 2]
## t = -4.0621, df = 9, p-value = 0.002833
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.4598858 -0.7001142
## sample estimates:
## mean of the differences 
##                   -1.58
#7.4 이표본 분산
with(iris, var.test(Sepal.Width, Sepal.Length))
## 
##  F test to compare two variances
## 
## data:  Sepal.Width and Sepal.Length
## F = 0.27706, num df = 149, denom df = 149, p-value = 3.595e-14
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.2007129 0.3824528
## sample estimates:
## ratio of variances 
##          0.2770617
#7.5 일표본 비율
prop.test(42,100)
## 
##  1-sample proportions test with continuity correction
## 
## data:  42 out of 100, null probability 0.5
## X-squared = 2.25, df = 1, p-value = 0.1336
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.3233236 0.5228954
## sample estimates:
##    p 
## 0.42
binom.test(42,100)
## 
##  Exact binomial test
## 
## data:  42 and 100
## number of successes = 42, number of trials = 100, p-value = 0.1332
## alternative hypothesis: true probability of success is not equal to 0.5
## 95 percent confidence interval:
##  0.3219855 0.5228808
## sample estimates:
## probability of success 
##                   0.42
#7.6 이표본 비율
prop.test(c(45,55), c(100,90))
## 
##  2-sample test for equality of proportions with continuity
##  correction
## 
## data:  c(45, 55) out of c(100, 90)
## X-squared = 4.3067, df = 1, p-value = 0.03796
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.31185005 -0.01037217
## sample estimates:
##    prop 1    prop 2 
## 0.4500000 0.6111111