統計檢定

## 原假說在一地區的民眾, 平均年齡是70 歲; 抽樣一百人, 發現這一些樣本的民眾平均年紀是65歲, 標準差是10 歲

### H0: 當地民眾的平均年紀65與假說年紀70的差距是來自於隨機結果
### H1: 當地民眾的平均年紀不應該等於70歲

### Z = (x - mu) / sde
### z = (x - mu) / (sd / sqrt(n)) = > sqrt(n) * (x - mu) / sd
z = (65 - 70) / (10/sqrt(100))
z
## [1] -5
pnorm(-5)
## [1] 2.866516e-07
par(mfrow = c(1,2))
curve(dnorm (x, 0, 1), xlim =c(-3, 3))
cord.x <-c(-3, seq (-3, 1.5 , 0.01 ), 1.5 )
cord.y <-c(0, dnorm(seq (-3, 1.5 , 0.01 )),0)
polygon(cord.x,cord.y, col="grey" )
curve (pnorm (x, 0, 1), xlim =c(-3, 3),  main ="F(z)=P(Z<=z)" )

par(mfrow = c(1,2))
curve(dnorm (x, 0, 1), xlim =c(-3, 3))
cord.x <-c(-3, seq (-3, -2 , 0.01 ), -2 )
cord.y <-c(0, dnorm(seq (-3, -2 , 0.01 )),0)
polygon(cord.x,cord.y, col="grey" )

cord.x <-c(2, seq (2, 3 , 0.01 ), 3 )
cord.y <-c(0, dnorm(seq (2, 3 , 0.01 )),0)
polygon(cord.x,cord.y, col="grey" )
curve (pnorm (x, 0, 1), xlim =c(-3, 3),  main ="F(z)=P(Z<=z)" )

1- (pnorm(2) - pnorm(-2))
## [1] 0.04550026
par(mfrow = c(1,2))
curve(dnorm (x, 0, 1), xlim =c(-6, 6))
cord.x <-c(-6, seq (-6, -5 , 0.01 ), -5 )
cord.y <-c(0, dnorm(seq (-6, -5 , 0.01 )),0)
polygon(cord.x,cord.y, col="grey" )

cord.x <-c(5, seq (5, 6 , 0.01 ), 6 )
cord.y <-c(0, dnorm(seq (5, 6 , 0.01 )),0)
polygon(cord.x,cord.y, col="grey" )
curve (pnorm (x, 0, 1), xlim =c(-5, 5),  main ="F(z)=P(Z<=z)" )

1 - (pnorm(5) - pnorm(-5))
## [1] 5.733031e-07
# 0.05 : 95% confidence; accept
# 0.01 : 99% confidence: accept

1- (pnorm(5) - pnorm(-5)) < 0.01
## [1] TRUE
## 原假說在一地區的平均下雨機率是是40% ; 在當地待了一百天以後, 發現該地區的下雨機率是43%

### H0:該地區的下雨機率跟原假說假設的40% 無異
### H1:該地區的下雨機率不等於40% 

### z = (x - mu) / sde

sqrt((0.4*0.6)/100)
## [1] 0.04898979
# sqrt((p0 * (1 - p0) ) / n)

z = (0.43 - 0.40) / 0.05
z
## [1] 0.6
1 - (pnorm(0.6) - pnorm(-0.6))
## [1] 0.5485062
0.5485 < 0.01
## [1] FALSE

Binomial Test

binom.test(x=92, n=315, p=1/6)
## 
##  Exact binomial test
## 
## data:  92 and 315
## number of successes = 92, number of trials = 315, p-value =
## 3.458e-08
## alternative hypothesis: true probability of success is not equal to 0.1666667
## 95 percent confidence interval:
##  0.2424273 0.3456598
## sample estimates:
## probability of success 
##              0.2920635
?binom.test
## starting httpd help server ... done

Student’s t test

## one sample test
x <- c(0.593, 0.142, 0.329, 0.691, 0.231, 0.793, 0.519, 0.392, 0.418)
mean(x)
## [1] 0.4564444
boxplot(x)
abline(h = 0.3, col='red')

?t.test


t.test(x, mu = 0.3)
## 
##  One Sample t-test
## 
## data:  x
## t = 2.2051, df = 8, p-value = 0.05853
## alternative hypothesis: true mean is not equal to 0.3
## 95 percent confidence interval:
##  0.2928381 0.6200508
## sample estimates:
## mean of x 
## 0.4564444
t.test(x, alternative = 'greater', mu = 0.3)
## 
##  One Sample t-test
## 
## data:  x
## t = 2.2051, df = 8, p-value = 0.02927
## alternative hypothesis: true mean is greater than 0.3
## 95 percent confidence interval:
##  0.3245133       Inf
## sample estimates:
## mean of x 
## 0.4564444
## two sample test
Control <- c(91, 87, 99, 77, 88, 91)
Treat   <- c(101, 110, 103, 93, 99, 104)
mu_control <- mean(Control)
mu_treat   <- mean(Treat)

mu_control
## [1] 88.83333
mu_treat
## [1] 101.6667
boxplot(Control, Treat, main = 'Control v.s. Treat Group', names = c('Control', 'Treat'))
abline(h = mu_control, col = 'red')
abline(h = mu_treat, col = 'red')

t.test(Control,Treat,alternative="less")
## 
##  Welch Two Sample t-test
## 
## data:  Control and Treat
## t = -3.4456, df = 9.4797, p-value = 0.003391
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -6.044949
## sample estimates:
## mean of x mean of y 
##  88.83333 101.66667
## paired t-test
druga <- c(16, 20, 21, 22, 23, 22, 27, 25, 27, 28)
drugb <- c(19, 22, 24, 24, 25, 25, 26, 26, 28, 32)

boxplot(druga, drugb)
abline(h = mean(druga), col='red')
abline(h = mean(drugb), col='red')

t.test(druga,drugb,alternative="greater", paired=TRUE)
## 
##  Paired t-test
## 
## data:  druga and drugb
## t = -4.4721, df = 9, p-value = 0.9992
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  -2.819793       Inf
## sample estimates:
## mean of the differences 
##                      -2

Two sample z-test

#給予實驗組A藥: 反應時間是 5 min. 標準差是 0.5
#給予對照組B藥: 反應時間是 3 min. 標準差是 0.4
z = (5 - 3) / sqrt(0.5^ 2 + 0.4 ^ 2)
z
## [1] 3.123475
pnorm(3.12)
## [1] 0.9990957
1 - (pnorm(3.12)- pnorm(-3.12))
## [1] 0.00180851

chi-squared test

obsfreq   <- c(10,15,20)
nullprobs <- c(.333,.333,.334)
?chisq.test

chisq.test(obsfreq, p = nullprobs)
## 
##  Chi-squared test for given probabilities
## 
## data:  obsfreq
## X-squared = 3.3018, df = 2, p-value = 0.1919
dice <- sample(1:6, 100, replace = TRUE)
obsfreq <- table(dice)
probs   <-rep(1/6, 6)
probs
## [1] 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667 0.1666667
chisq.test(obsfreq, p = probs)
## 
##  Chi-squared test for given probabilities
## 
## data:  obsfreq
## X-squared = 2.12, df = 5, p-value = 0.8323
loaded_dice <- sample(1:6, 100, replace = TRUE, prob=c(0.1,0.1,0.1,0.1,0.1,0.5))

obsfreq <- table(loaded_dice)
obsfreq
## loaded_dice
##  1  2  3  4  5  6 
## 12 11  5 17  7 48
chisq.test(obsfreq, p = probs)
## 
##  Chi-squared test for given probabilities
## 
## data:  obsfreq
## X-squared = 75.92, df = 5, p-value = 5.978e-15
barplot(obsfreq)

p <- prop.table(obsfreq)
barplot(p, ylim = c(0,1), main = 'Dice Probability', xlab = 'point', ylab='prob')
abline(h = 1/6, col = 'red')

load("C:/Users/nc20/Downloads/cdc.Rdata")
head(cdc)
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f
tb <- table(cdc$smoke100, cdc$gender)
mosaicplot(tb, col =c('blue', 'red'))

tb2 <- table(cdc$gender, cdc$smoke100 )
mosaicplot(tb2, col =c('blue', 'red'))

barplot(tb2, beside = TRUE, col =c('blue', 'red'))

m <- matrix(c(50,50,50,50), nrow = 2)
m
##      [,1] [,2]
## [1,]   50   50
## [2,]   50   50
chisq.test(m)
## 
##  Pearson's Chi-squared test
## 
## data:  m
## X-squared = 0, df = 1, p-value = 1
chisq.test(tb2)
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  tb2
## X-squared = 204.6, df = 1, p-value < 2.2e-16
obsfreq <- matrix(c(15,30, 5,10, 20,40),nrow=2,ncol=3)
chisq.test(obsfreq)
## 
##  Pearson's Chi-squared test
## 
## data:  obsfreq
## X-squared = 0, df = 2, p-value = 1
obsfreq <- matrix(c(20,30, 5,10, 40,40),nrow=2,ncol=3)
chisq.test(obsfreq)
## 
##  Pearson's Chi-squared test
## 
## data:  obsfreq
## X-squared = 2.1378, df = 2, p-value = 0.3434