# Probability ----
# Basic use of sample(): random draws from a vector.
# No seed is set above these calls in this script, so each recorded output
# below is just one possible result of a run.
?sample
## starting httpd help server ... done

# The population for the first examples: the integers 1 through 10.
1:10
## [1] 1 2 3 4 5 6 7 8 9 10

# Without a size, sample() returns a random permutation of its input.
sample(x = 1:10)
## [1] 7 4 3 5 9 1 6 10 8 2

# Draw five elements without replacement (no value can repeat).
sample(x = 1:10, size = 5)
## [1] 2 10 4 9 6

# Draw five elements with replacement (values may repeat, e.g. the two 7s).
sample(x = 1:10, size = 5, replace = TRUE)
## [1] 3 1 10 7 7

# Left disabled on purpose: sample(c(0, 1), size = 10) asks for ten draws
# without replacement from a population of two, which sample() rejects.
# Ten draws from {0, 1} therefore need replace = TRUE.
sample(x = c(0, 1), size = 10, replace = TRUE)
## [1] 0 0 1 1 1 1 0 0 1 1

# prob gives the weight of each element: 0 with weight 0.7, 1 with 0.3.
sample(x = c(0, 1), size = 10, replace = TRUE, prob = c(0.7, 0.3))
## [1] 0 0 0 0 0 0 0 0 0 1
# Ten example values used to show how a 0/1 flag vector splits a data set.
dataset <- c(90, 80, 55, 66, 77, 88, 99, 53, 87, 95)
length(dataset)
## [1] 10

# Hand-made flags: the first seven positions are marked 1, the last three 0.
idx <- rep(c(1, 0), times = c(7, 3))
# Elements whose flag is 1 ...
dataset[idx == 1]
## [1] 90 80 55 66 77 88 99
# ... and the remaining elements, whose flag is 0.
dataset[idx == 0]
## [1] 53 87 95

# Random flags instead: one flag per element, 1 with weight 0.7 and 0 with
# weight 0.3, so the split sizes vary from run to run.
idx <- sample(
  c(0, 1),
  size = length(dataset),
  replace = TRUE,
  prob = c(0.3, 0.7)
)
idx
## [1] 1 0 1 1 1 1 1 1 0 1
dataset[idx == 1]
## [1] 90 55 66 77 88 99 53 95
dataset[idx == 0]
## [1] 80 87
# Lottery-style pick: six distinct numbers out of 1..42. This call runs
# before any seed is set in this script.
sample(x = 1:42, size = 6)
## [1] 8 40 42 11 18 35

# Fix the RNG state so the draws that follow can be reproduced.
set.seed(123)
# Each call advances the RNG, so the three picks differ from one another,
# but the sequence as a whole is determined by the seed.
sample(x = 1:42, size = 6)
## [1] 13 33 17 35 36 2
sample(x = 1:42, size = 6)
## [1] 23 37 42 18 41 17
sample(x = 1:42, size = 6)
## [1] 29 24 5 36 10 2

# First entries of the RNG state vector kept in the global environment.
head(.Random.seed)
## [1] 403 18 515190382 2133433928 917665867 1283494313
# NOTE(review): the leading 403 looks like the pre-R-3.6 default
# (sample.kind "Rounding"); newer R versions would print different draws
# for the same seed -- confirm before relying on the recorded numbers.

# sample.int(n, size) draws from 1..n without building the vector first.
?sample.int
sample.int(n = 42, size = 6)
## [1] 14 40 36 28 25 37
# Simulate coin flips and tabulate how often each side comes up.
coins <- c("heads", "tails")
n_flips <- 1000

# Fair coin: both sides equally likely (sample()'s default weights).
fair_coin <- sample(coins, size = n_flips, replace = TRUE)
table(fair_coin)
## fair_coin
## heads tails
## 512 488

# Biased coin: heads with weight 0.3, tails with weight 0.7.
unfair_coin <- sample(
  coins,
  size = n_flips,
  replace = TRUE,
  prob = c(0.3, 0.7)
)
table(unfair_coin)
## unfair_coin
## heads tails
## 297 703
# Simulate 10000 rolls of a six-sided die and plot the face counts.
dices <- seq_len(6)

# Fair die: every face has the same weight.
outcomes <- sample(dices, size = 10000, replace = TRUE)
tb <- table(outcomes)
barplot(tb)

# Loaded die: faces 1-5 each have weight 0.1, face 6 has weight 0.5.
outcomes2 <- sample(
  dices,
  size = 10000,
  replace = TRUE,
  prob = c(rep(0.1, 5), 0.5)
)
tb2 <- table(outcomes2)
barplot(tb2)

# Normal Distribution ----
# Reset the plotting layout to a single panel.
par(mfrow = c(1, 1))

# Location of the CDC data file. The original machine-specific path stays
# the default so existing behavior is unchanged; set the CDC_RDATA
# environment variable to load the file from somewhere else.
cdc_path <- Sys.getenv(
  "CDC_RDATA",
  unset = "C:/Users/nc20/Downloads/cdc.Rdata"
)
if (!file.exists(cdc_path)) {
  stop(
    "CDC data file not found: ", cdc_path,
    " (set the CDC_RDATA environment variable to its location)",
    call. = FALSE
  )
}

# load() restores the saved objects into the workspace and returns their
# names; the rest of the script needs a data frame called `cdc`.
loaded_objects <- load(cdc_path)
if (!"cdc" %in% loaded_objects) {
  stop("'", cdc_path, "' does not contain an object named 'cdc'",
       call. = FALSE)
}

head(cdc)
## genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1 good 0 1 0 70 175 175 77 m
## 2 good 0 1 1 64 125 115 33 f
## 3 good 1 1 1 60 105 105 49 f
## 4 good 1 1 0 66 132 124 42 f
## 5 very good 0 1 0 61 150 130 55 f
## 6 very good 1 1 0 64 114 114 55 f

# Distribution of the height column, using 50 histogram bins.
hist(cdc$height, breaks = 50)

# Compare two normal samples that share a mean but differ in spread.
?rnorm
a <- rnorm(100, mean = 0, sd = 1)
b <- rnorm(100, mean = 0, sd = 3)

# Stack the two histograms in one column. par() returns the previous
# settings, which are restored afterwards so the two-panel layout does not
# leak into every later plot in the script.
old_par <- par(mfrow = c(2, 1))
# A shared x range makes the difference in spread directly comparable.
hist(a, xlim = c(-10, 10))
hist(b, xlim = c(-10, 10))
par(old_par)

?hist

# Density of the standard normal distribution on [-3, 3].
curve(dnorm, from = -3, to = 3)
# Height of the density at 0 for sd = 1 (the default) ...
dnorm(0)
## [1] 0.3989423
# ... and for sd = 5: a wider curve is lower at its centre.
dnorm(0, mean = 0, sd = 5)
## [1] 0.07978846

# Cumulative distribution function of the standard normal on [-3, 3].
curve(pnorm(x), from = -3, to = 3)

# pnorm(q) is the probability of a value <= q; half the mass lies below 0.
pnorm(0)
## [1] 0.5
curve(dnorm, from = -3, to = 3)
abline(v = 0, col = "red")

# Probability of a value <= 1, with the cut-off marked on the density.
pnorm(1)
## [1] 0.8413447
curve(dnorm, from = -3, to = 3)
abline(v = 1, col = "red")

pnorm(1)
## [1] 0.8413447
pnorm(-1)
## [1] 0.1586553
# The upper tail beyond 1 equals the lower tail below -1 (symmetry).
pnorm(1, lower.tail = FALSE)
## [1] 0.1586553

# Probability mass within 1, 2 and 3 standard deviations of the mean.
pnorm(1) - pnorm(-1)
## [1] 0.6826895
pnorm(2) - pnorm(-2)
## [1] 0.9544997
pnorm(3) - pnorm(-3)
## [1] 0.9973002
# Check how closely the observed heights follow the 1-sd and 3-sd rules.
mu <- mean(cdc$height)
s <- sd(cdc$height)
mu
## [1] 67.1829

# Bounds of the interval one standard deviation either side of the mean.
mu - s
## [1] 63.05695
mu + s
## [1] 71.30885

# Share of observations inside that interval (recorded: about 0.62).
within_one_sd <- cdc$height >= mu - s & cdc$height <= mu + s
length(cdc$height[within_one_sd]) / length(cdc$height)
## [1] 0.62125

# Heights more than three standard deviations above the mean
# (18 values in the recorded run).
upper_cutoff <- mu + 3 * s
cdc$height[cdc$height > upper_cutoff]
## [1] 80 81 83 80 80 80 80 82 81 82 80 81 80 80 93 80 80 84
# Shapes of a few non-normal distributions, drawn with a fixed seed.
set.seed(50)
?runif

# Continuous uniform on [0, 5].
y <- runif(10000, min = 0, max = 5)
hist(y)

# Discrete uniform: face counts from 1000 rolls of a fair die.
dices <- seq_len(6)
barplot(table(sample(dices, size = 1000, replace = TRUE)))

# Poisson counts; the second argument of rpois() is the rate lambda.
hist(rpois(5000, 3))
# Formal normality check on a reproducible subsample of the heights.
set.seed(123)
# shapiro.test() accepts at most 5000 observations, hence the subsample.
sample_height <- sample(cdc$height, size = 5000)
hist(sample_height)

?shapiro.test
# Recorded p-value is below 2.2e-16, so normality is rejected for the
# sampled heights.
shapiro.test(sample_height)
##
## Shapiro-Wilk normality test
##
## data: sample_height
## W = 0.98679, p-value < 2.2e-16
# Normality tests on simulated data. The calls are kept exactly as written
# because the tests echo the call text in their "data:" output line.

# Shapiro-Wilk on 100 normal draws: the recorded p-value (0.2673) is above
# 0.05, so normality is not rejected.
shapiro.test(rnorm(100, mean = 5, sd = 3))
##
## Shapiro-Wilk normality test
##
## data: rnorm(100, mean = 5, sd = 3)
## W = 0.98397, p-value = 0.2673
# Shapiro-Wilk on 100 uniform draws: the recorded p-value (0.001862) is
# below 0.05, so normality is rejected.
shapiro.test(runif(100, min = 2, max = 4))
##
## Shapiro-Wilk normality test
##
## data: runif(100, min = 2, max = 4)
## W = 0.95522, p-value = 0.001862
# One-sample Kolmogorov-Smirnov test of 100 standard-normal draws against
# the normal CDF ('pnorm' with its default mean 0 and sd 1); the recorded
# p-value (0.3073) does not reject that distribution.
ks.test(rnorm(100), 'pnorm')
##
## One-sample Kolmogorov-Smirnov test
##
## data: rnorm(100)
## D = 0.096679, p-value = 0.3073
## alternative hypothesis: two-sided
# Sampling Distribution ----
# Histogram of all heights again, as the reference for what follows.
hist(cdc$height, breaks = 50)

# rep() repeats its first argument; used below to preallocate vectors.
?rep
# A single value repeated five times.
rep(0, times = 5)
## [1] 0 0 0 0 0
# A whole vector repeated five times, keeping the element order.
rep(c(0, 1), times = 5)
## [1] 0 1 0 1 0 1 0 1 0 1
# Bad method: start empty and grow the vector by one element per
# iteration, which builds a new, longer vector every time.
a <- NULL
for (i in seq_len(10)) {
  a <- append(a, i)
}
a
## [1] 1 2 3 4 5 6 7 8 9 10

# Good method: allocate the full length up front (NA placeholders), then
# fill each slot in place.
a <- rep(NA, times = 10)
for (i in seq_along(a)) {
  a[i] <- i
}
a
## [1] 1 2 3 4 5 6 7 8 9 10
# Sampling distribution of the mean, loop version: repeat 5000 times and
# record the mean height of random samples of size 10, 50 and 100.
n_reps <- 5000

# Preallocate one result vector per sample size.
sample_mean10 <- rep(NA, times = n_reps)
sample_mean50 <- rep(NA, times = n_reps)
sample_mean100 <- rep(NA, times = n_reps)

for (i in seq_len(n_reps)) {
  # The three draws stay in this order within each iteration so the
  # sequence of random numbers consumed is unchanged.
  samp <- sample(cdc$height, size = 10)
  sample_mean10[i] <- mean(samp)

  samp <- sample(cdc$height, size = 50)
  sample_mean50[i] <- mean(samp)

  samp <- sample(cdc$height, size = 100)
  sample_mean100[i] <- mean(samp)
}
# Sampling distribution of the mean, functional version.
#
# draw_sample_means() returns `reps` means, each computed from a fresh
# random sample of `size` elements of `x` (drawn without replacement).
# vapply() with numeric(1) guarantees a plain numeric vector of length
# `reps`, whereas sapply() chooses its return type from the results.
draw_sample_means <- function(x, size, reps = 5000) {
  vapply(
    seq_len(reps),
    function(e) mean(sample(x, size)),
    numeric(1)
  )
}

# 5000 sample means for each sample size. The calls run in the same order
# as before, so the random numbers are consumed in the same sequence.
sample_mean10 <- draw_sample_means(cdc$height, 10)
sample_mean50 <- draw_sample_means(cdc$height, 50)
sample_mean100 <- draw_sample_means(cdc$height, 100)
sample_mean1000 <- draw_sample_means(cdc$height, 1000)
# Centre and spread of each set of sample means, with the interval three
# standard deviations either side of the centre. In the recorded output
# the interval narrows as the sample size grows.

# Sample size 10.
mu10 <- mean(sample_mean10)
sd10 <- sd(sample_mean10)
mu10 - 3 * sd10
## [1] 63.2492
mu10 + 3 * sd10
## [1] 71.09008

# Sample size 50.
mu50 <- mean(sample_mean50)
sd50 <- sd(sample_mean50)
mu50 - 3 * sd50
## [1] 65.44243
mu50 + 3 * sd50
## [1] 68.93508

# Sample size 100.
mu100 <- mean(sample_mean100)
sd100 <- sd(sample_mean100)
mu100 - 3 * sd100
## [1] 65.95618
mu100 + 3 * sd100
## [1] 68.41622

# Sample size 1000.
mu1000 <- mean(sample_mean1000)
sd1000 <- sd(sample_mean1000)
mu1000 - 3 * sd1000
## [1] 66.80043
mu1000 + 3 * sd1000
## [1] 67.56821
# Compare the four sampling distributions against the overall mean height.
height_mu <- mean(cdc$height)
# Use the range of the widest distribution (n = 10) as the common x axis
# so the narrowing with larger sample sizes is visible.
xlimits <- range(sample_mean10)

# Stack the four histograms in one column. par() returns the previous
# settings, which are restored afterwards so the four-panel layout does
# not leak into later plots.
old_par <- par(mfrow = c(4, 1))
hist(sample_mean10, breaks = 20, xlim = xlimits)
abline(v = height_mu, col = "red")
hist(sample_mean50, breaks = 20, xlim = xlimits)
abline(v = height_mu, col = "red")
hist(sample_mean100, breaks = 20, xlim = xlimits)
abline(v = height_mu, col = "red")
hist(sample_mean1000, breaks = 20, xlim = xlimits)
abline(v = height_mu, col = "red")
par(old_par)

# Overall mean height (the red reference line above) ...
height_mu
## [1] 67.1829
# ... next to the mean of one random sample of only ten heights.
mean(sample(cdc$height, 10))
## [1] 65.5