# Demo20180523

# Probability ----

# Basics of random sampling with sample().
# No seed is set before these calls, so the recorded outputs below are just
# one possible result and will differ from run to run.

# Open the help page for sample().
?sample

## starting httpd help server ... done

# The population used in the next examples: the integers 1 through 10.
1:10

##  [1]  1  2  3  4  5  6  7  8  9 10

# With only the population given, sample() returns all ten values in a
# random order (a permutation).
sample(1:10)

##  [1]  7  4  3  5  9  1  6 10  8  2

# size = 5: draw five values; sampling is without replacement by default,
# so no value can repeat.
sample(1:10, size = 5)

## [1]  2 10  4  9  6

# replace = TRUE: a value may be drawn more than once (7 appears twice in the
# recorded output).
sample(1:10, size = 5, replace=TRUE)

## [1]  3  1 10  7  7

# The next call is left disabled: asking for 10 draws from a 2-element
# population without replacement is an error in R.
#sample(c(0,1), size = 10)
# With replacement, ten 0/1 draws are possible; both values are equally likely.
sample(c(0,1), size = 10, replace=TRUE)

##  [1] 0 0 1 1 1 1 0 0 1 1

# prob gives one weight per population element: here 0 is drawn with
# probability 0.7 and 1 with probability 0.3.
sample(c(0,1), size = 10, replace=TRUE, prob = c(0.7,0.3))

##  [1] 0 0 0 0 0 0 0 0 0 1

# Ten example values that are split into two groups with a 0/1 indicator.
dataset <- c(90, 80, 55, 66, 77, 88, 99, 53, 87, 95)
length(dataset)

## [1] 10

# Fixed indicator: the first seven positions are flagged 1, the last three 0.
idx <- rep(c(1, 0), times = c(7, 3))

# A 0/1 indicator converts directly to a logical mask: 1 -> TRUE, 0 -> FALSE.
dataset[as.logical(idx)]

## [1] 90 80 55 66 77 88 99

# Negating the indicator selects the positions flagged 0.
dataset[!idx]

## [1] 53 87 95

# Random indicator: each position is independently flagged 1 with probability
# 0.7 and 0 with probability 0.3, so the group sizes vary from run to run.
idx <- sample(c(0, 1), size = 10, replace = TRUE, prob = c(0.3, 0.7))
idx

##  [1] 1 0 1 1 1 1 1 1 0 1

dataset[as.logical(idx)]

## [1] 90 55 66 77 88 99 53 95

dataset[!idx]

## [1] 80 87

# Reproducible sampling with set.seed().

# Draw 6 distinct numbers from 1..42. No seed has been set at this point, so
# this result changes on every run.
sample(1:42, size= 6)

## [1]  8 40 42 11 18 35

# Fix the state of the random number generator. Every draw after this point
# is determined by the seed, so re-running from here repeats the same results.
# NOTE(review): the recorded outputs date from 2018. R 3.6.0 changed the
# default sampling algorithm, so a newer R prints different numbers for the
# same seed unless RNGkind(sample.kind = "Rounding") is set first.
set.seed(123)
sample(1:42, size= 6)

## [1] 13 33 17 35 36  2

# Each call advances the generator state, so repeated calls give different
# draws even though the seed was set only once.
sample(1:42, size= 6)

## [1] 23 37 42 18 41 17

sample(1:42, size= 6)

## [1] 29 24  5 36 10  2

# .Random.seed is the integer vector in which R keeps the generator state;
# only its first few entries are shown.
head(.Random.seed)

## [1]        403         18  515190382 2133433928  917665867 1283494313

?sample.int

# sample.int(n, size) draws `size` values from 1..n, so this is the same kind
# of draw as sample(1:42, size = 6) above.
sample.int(42,6)

## [1] 14 40 36 28 25 37

# Simulate 1000 tosses of a fair coin and count how often each face appears.
coins <- c("heads", "tails")
fair_coin <- sample(coins, size = 1000, replace = TRUE)
table(fair_coin)

## fair_coin
## heads tails 
##   512   488

# Biased coin: heads with probability 0.3, tails with probability 0.7.
unfair_coin <- sample(
  coins,
  size = 1000,
  replace = TRUE,
  prob = c(0.3, 0.7)
)
table(unfair_coin)

## unfair_coin
## heads tails 
##   297   703

# Fair six-sided die rolled 10000 times; the bar chart shows the face counts.
dices <- seq_len(6)
outcomes <- sample(dices, size = 10000, replace = TRUE)
tb <- table(outcomes)
barplot(tb)

# Loaded die: faces 1-5 each have weight 0.1 and face 6 has weight 0.5.
outcomes2 <- sample(
  dices,
  size = 10000,
  replace = TRUE,
  prob = c(rep(0.1, 5), 0.5)
)
tb2 <- table(outcomes2)
barplot(tb2)

# Normal Distribution ----

# Load the cdc data and compare its height histogram with simulated normal data.

# Use a single plot panel.
par(mfrow=c(1,1))
# load() restores the objects saved in the .Rdata file into the workspace; the
# code below expects it to provide a data frame named `cdc`.
# NOTE(review): this absolute path is specific to the author's Windows machine
# and must be adjusted before the script can run elsewhere.
load("C:/Users/nc20/Downloads/cdc.Rdata")
head(cdc)

##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f

# Histogram of the height column; breaks = 50 asks for roughly 50 bins.
hist(cdc$height, breaks = 50)

?rnorm
# Two simulated normal samples of 100 values each, both centred at 0, with
# standard deviation 1 (a) and 3 (b).
a <- rnorm(100, mean = 0 , sd = 1)
b <- rnorm(100, mean = 0 , sd = 3)
# Stack the two histograms and give them the same x-axis range so the
# difference in spread is directly comparable.
par(mfrow = c(2,1))
hist(a, xlim = c(-10,10))
hist(b, xlim = c(-10,10))

?hist


# Density (dnorm) and cumulative probability (pnorm) of the normal distribution.

# Plot the standard normal density between -3 and 3.
curve(dnorm ,-3,3)
# Density height at 0 for the standard normal (mean 0, sd 1).
dnorm(0)

## [1] 0.3989423

# Same point with sd = 5: the curve is wider, so its peak is lower.
dnorm(0,mean =0,sd =5)

## [1] 0.07978846

# Plot the cumulative distribution function P(X <= x) between -3 and 3.
curve(pnorm (x), -3,3)

# Half of the probability mass lies at or below the mean.
pnorm(0)

## [1] 0.5

# Mark x = 0 on the density curve: pnorm(0) is the area to the left of the line.
curve(dnorm ,-3,3)
abline(v = 0, col='red')

# P(X <= 1): the area under the density to the left of x = 1.
pnorm(1)

## [1] 0.8413447

# Mark x = 1 on the density curve.
curve(dnorm ,-3,3)
abline(v = 1, col='red')

pnorm(1)

## [1] 0.8413447

# P(X <= -1): the lower tail.
pnorm(-1)

## [1] 0.1586553

# lower.tail = FALSE gives the upper tail P(X > 1); the recorded value equals
# pnorm(-1) because the distribution is symmetric around 0.
pnorm(1, lower.tail = FALSE)

## [1] 0.1586553

# Probability of falling within 1, 2 and 3 standard deviations of the mean:
# about 68%, 95% and 99.7% respectively.
pnorm(1) - pnorm(-1)

## [1] 0.6826895

pnorm(2) - pnorm(-2)

## [1] 0.9544997

pnorm(3) - pnorm(-3)

## [1] 0.9973002

# Check how closely the observed heights follow the normal "within one
# standard deviation" proportion.

# Mean and standard deviation of the observed heights.
mu <- mean(cdc$height)
s  <- sd(cdc$height)

mu

## [1] 67.1829

# Lower and upper bounds of the interval mean +/- one standard deviation.
mu - s

## [1] 63.05695

mu + s

## [1] 71.30885

# Share of observations inside [mu - s, mu + s]: number of heights in the
# interval divided by the total number of heights. The recorded 0.62125 is
# somewhat below the 0.6827 that pnorm(1) - pnorm(-1) gives for an exactly
# normal variable.
length(cdc$height[(cdc$height >= mu - s) & (cdc$height <= mu + s)]) / length(cdc$height)

## [1] 0.62125

# Heights more than three standard deviations above the mean (18 values in
# the recorded output).
cdc$height[cdc$height > mu + 3 * s]

##  [1] 80 81 83 80 80 80 80 82 81 82 80 81 80 80 93 80 80 84

# Other distributions, followed by formal normality tests.

# Seed the generator so the simulated data below is repeatable.
set.seed(50)
?runif
# 10000 draws from the uniform distribution on [0, 5]; the histogram should
# look roughly flat.
y <- runif(10000,0,5)
hist(y)

dices <- 1:6

# 1000 rolls of a fair die: a discrete uniform distribution.
barplot(table(sample(dices, 1000, replace=TRUE)) )

# 5000 draws from a Poisson distribution with rate (lambda) 3.
hist(rpois(5000 ,3))

# Re-seed, then take a random subsample of 5000 heights.
# NOTE(review): presumably 5000 is used because shapiro.test() accepts at most
# 5000 observations - confirm against its help page.
set.seed(123)
sample_height <- sample(cdc$height, 5000 )
hist(sample_height)

?shapiro.test
# Shapiro-Wilk test; the null hypothesis is that the data is normally
# distributed. The recorded p-value is far below 0.05, so normality is
# rejected for the sampled heights.
shapiro.test(sample_height)

## 
##  Shapiro-Wilk normality test
## 
## data:  sample_height
## W = 0.98679, p-value < 2.2e-16

# Control case: data that really is normal. The recorded p-value (0.2673) gives
# no reason to reject normality.
shapiro.test(rnorm(100, mean = 5, sd = 3))

## 
##  Shapiro-Wilk normality test
## 
## data:  rnorm(100, mean = 5, sd = 3)
## W = 0.98397, p-value = 0.2673

# Control case: uniform data. The recorded p-value (0.001862) rejects normality.
shapiro.test(runif(100, min = 2, max = 4))

## 
##  Shapiro-Wilk normality test
## 
## data:  runif(100, min = 2, max = 4)
## W = 0.95522, p-value = 0.001862

# One-sample Kolmogorov-Smirnov test of 100 standard normal draws against the
# normal CDF named by 'pnorm' (default mean 0, sd 1). The recorded p-value
# (0.3073) gives no reason to reject that distribution.
ks.test(rnorm(100), 'pnorm')

## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  rnorm(100)
## D = 0.096679, p-value = 0.3073
## alternative hypothesis: two-sided
## alternative hypothesis: two-sided

# Sampling Distribution ----

# Distribution of all observed heights, shown again as the starting point for
# the sampling experiments that follow.
hist(cdc$height, breaks = 50)

?rep
# rep(x, n) repeats x n times: here a vector of five zeros.
rep(0, 5)

## [1] 0 0 0 0 0

# When x is a vector, the whole vector is repeated n times in order.
rep(c(0,1), 5)

##  [1] 0 1 0 1 0 1 0 1 0 1

# Anti-pattern: start from an empty vector and append one element per
# iteration. Every append builds a new, longer vector from the old one.
a <- NULL
for (i in seq_len(10)) {
  a <- append(a, i)
}
a

##  [1]  1  2  3  4  5  6  7  8  9 10

# Preferred: allocate the full-length vector once (filled with NA), then
# overwrite each slot in place.
a <- rep(NA, 10)
for (i in seq_along(a)) {
  a[[i]] <- i
}
a

##  [1]  1  2  3  4  5  6  7  8  9 10

# Sampling distribution of the mean: repeatedly draw a random sample of a
# given size from cdc$height and record the mean of each sample.

# Number of repeated samples per sample size.
n_reps <- 5000L

# Draw `reps` random samples of `size` values from `x` (without replacement)
# and return the mean of each sample.
#   x    : numeric vector to sample from
#   size : number of values in each sample
#   reps : number of samples to draw
# Returns a numeric vector of length `reps`. vapply() is used instead of
# sapply() so the result is always a numeric vector, whatever the input.
sample_means <- function(x, size, reps = n_reps) {
  vapply(seq_len(reps), function(rep_idx) mean(sample(x, size)), numeric(1))
}

# Version 1: explicit loop over preallocated result vectors.
# The three vectors are recomputed by version 2 below. The loop is kept both as
# the loop-based illustration and because removing it would change the sequence
# of random draws, and with it every number computed afterwards.
sample_mean10  <- rep(NA_real_, n_reps)
sample_mean50  <- rep(NA_real_, n_reps)
sample_mean100 <- rep(NA_real_, n_reps)

for (i in seq_len(n_reps)) {
  samp              <- sample(cdc$height, 10)
  sample_mean10[i]  <- mean(samp)

  samp              <- sample(cdc$height, 50)
  sample_mean50[i]  <- mean(samp)

  samp              <- sample(cdc$height, 100)
  sample_mean100[i] <- mean(samp)
}

# Version 2: the same experiment written functionally, one call per sample
# size. These are the vectors used by the rest of the script.
sample_mean10   <- sample_means(cdc$height, 10)

sample_mean50   <- sample_means(cdc$height, 50)

sample_mean100  <- sample_means(cdc$height, 100)

sample_mean1000 <- sample_means(cdc$height, 1000)

# For each sample size, summarise the simulated sample means by their mean and
# standard deviation, and print the interval mean +/- 3 standard deviations.
# In the recorded output the interval narrows as the sample size grows:
# about 63.2-71.1 (n = 10), 65.4-68.9 (n = 50), 66.0-68.4 (n = 100) and
# 66.8-67.6 (n = 1000).

# Sample size 10.
mu10 <- mean(sample_mean10)
sd10 <- sd(sample_mean10)

mu10 - sd10 * 3

## [1] 63.2492

mu10 + sd10 * 3

## [1] 71.09008

# Sample size 50.
mu50 <- mean(sample_mean50)
sd50 <- sd(sample_mean50)

mu50 - sd50 * 3

## [1] 65.44243

mu50 + sd50 * 3

## [1] 68.93508

# Sample size 100.
mu100 <- mean(sample_mean100)
sd100 <- sd(sample_mean100)

mu100 - sd100 * 3

## [1] 65.95618

mu100 + sd100 * 3

## [1] 68.41622

# Sample size 1000.
mu1000 <- mean(sample_mean1000)
sd1000 <- sd(sample_mean1000)

mu1000 - sd1000 * 3

## [1] 66.80043

mu1000 + sd1000 * 3

## [1] 67.56821

# Reference value: the mean of all observed heights.
height_mu <- mean(cdc$height)

# Every panel shares the x-axis range of the n = 10 sample means, so the
# panels can be compared directly.
xlimits <- range(sample_mean10)

# Four stacked panels, one per sample size, each with a red vertical line at
# the overall mean height.
par(mfrow = c(4, 1))
invisible(local({
  panels <- list(
    sample_mean10   = sample_mean10,
    sample_mean50   = sample_mean50,
    sample_mean100  = sample_mean100,
    sample_mean1000 = sample_mean1000
  )
  for (label in names(panels)) {
    # main and xlab are spelled out so each panel keeps the title and axis
    # label that hist() would derive from the variable name.
    hist(
      panels[[label]],
      breaks = 20,
      xlim = xlimits,
      main = paste("Histogram of", label),
      xlab = label
    )
    abline(v = height_mu, col = "red")
  }
}))

height_mu

## [1] 67.1829

# A single sample of 10 heights: its mean can land well away from the overall
# mean (65.5 against 67.18 in the recorded run).
mean(sample(cdc$height, 10))

## [1] 65.5

# Demo20180523
#
# Author: David Chiu
#
# Date: 2018年5月23日 (23 May 2018)
#
# Contents:
#   Probability
#
#   Normal Distribution
#
#   Sampling Distribution