# Probability ----

# Basics of sample(): random draws from a vector.
# Lines starting with "## " are console output recorded when this was run;
# draws made before any set.seed() call will differ on every run.
?sample
## starting httpd help server ... done
1:10
##  [1]  1  2  3  4  5  6  7  8  9 10
# With no size given, sample() returns a random permutation of its input.
sample(1:10)
##  [1]  7  4  3  5  9  1  6 10  8  2
# Draw 5 values without replacement (no value can repeat).
sample(1:10, size = 5)
## [1]  2 10  4  9  6
# Draw 5 values with replacement (repeats are possible, e.g. the two 7s below).
sample(1:10, size = 5, replace=TRUE)
## [1]  3  1 10  7  7
# Left commented out on purpose: asking for 10 draws from a 2-element vector
# without replacement is an error ("cannot take a sample larger than the
# population when 'replace = FALSE'").
#sample(c(0,1), size = 10)
sample(c(0,1), size = 10, replace=TRUE)
##  [1] 0 0 1 1 1 1 0 0 1 1
# prob gives per-element weights: 0 is drawn with probability 0.7, 1 with 0.3.
sample(c(0,1), size = 10, replace=TRUE, prob = c(0.7,0.3))
##  [1] 0 0 0 0 0 0 0 0 0 1
# Use a 0/1 indicator vector to split a data vector into two groups.
dataset <- c(90,80,55,66,77,88,99,53,87,95)
length(dataset)
## [1] 10
# Hand-written indicator: first seven elements in group 1, last three in group 0.
idx     <- c(1,1,1,1,1,1,1,0,0,0)
# idx == 1 is a logical mask the same length as dataset.
dataset[idx == 1]
## [1] 90 80 55 66 77 88 99
dataset[idx == 0]
## [1] 53 87 95
# Random indicator instead: each element independently gets 1 with probability
# 0.7 and 0 with probability 0.3, so group sizes vary from run to run.
idx <- sample(c(0,1), size = 10, replace=TRUE, prob = c(0.3,0.7))
idx
##  [1] 1 0 1 1 1 1 1 1 0 1
dataset[idx == 1]
## [1] 90 55 66 77 88 99 53 95
dataset[idx == 0]
## [1] 80 87
# Reproducible random draws with set.seed().
# This first draw happens before the seed is set, so it is not reproducible.
sample(1:42, size= 6)
## [1]  8 40 42 11 18 35
# Fixing the seed makes the sequence of draws that follows repeatable.
set.seed(123)
# Successive calls still differ from each other: each draw advances the RNG
# state, so only the sequence as a whole is reproducible.
sample(1:42, size= 6)
## [1] 13 33 17 35 36  2
sample(1:42, size= 6)
## [1] 23 37 42 18 41 17
sample(1:42, size= 6)
## [1] 29 24  5 36 10  2
# .Random.seed is the integer vector holding the current RNG state.
head(.Random.seed)
## [1]        403         18  515190382 2133433928  917665867 1283494313
?sample.int

# sample.int(n, size) draws size values from 1:n.
sample.int(42,6)
## [1] 14 40 36 28 25 37
# Simulate coin flips and die rolls, then tabulate / plot the outcome counts.
coins     <- c('heads', 'tails')
# 1000 flips of a fair coin (equal weights are the default).
fair_coin <- sample(coins, size = 1000, replace=TRUE)
table(fair_coin)
## fair_coin
## heads tails 
##   512   488
# 1000 flips of a biased coin: heads with probability 0.3, tails with 0.7.
unfair_coin <- sample(coins, size = 1000, replace=TRUE, prob =c(0.3,0.7))
table(unfair_coin)
## unfair_coin
## heads tails 
##   297   703
# 10000 rolls of a fair six-sided die; the bar plot shows counts per face.
dices <- 1:6
outcomes <- sample(dices, 10000, replace=TRUE)
tb <- table(outcomes)
barplot(tb)

# 10000 rolls of a loaded die: face 6 has probability 0.5, every other face 0.1.
outcomes2 <- sample(dices, 10000, replace=TRUE, prob = c(0.1, 0.1,0.1,0.1,0.1,0.5 ))
tb2 <- table(outcomes2)
barplot(tb2)

# Normal Distribution ----

# Load the cdc data set and look at the distribution of heights.
# Reset the plotting device to a single panel.
par(mfrow=c(1,1))
# NOTE(review): absolute, machine-specific Windows path; this line fails on any
# other machine until it points at a local copy of cdc.Rdata.
# load() restores the objects saved in the file; the code below relies on it
# providing a data frame named `cdc`.
load("C:/Users/nc20/Downloads/cdc.Rdata")
head(cdc)
##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f
# Histogram of the height column; breaks = 50 requests roughly 50 bins.
hist(cdc$height, breaks = 50)

# The normal distribution in R: rnorm (random draws), dnorm (density) and
# pnorm (cumulative probability).
?rnorm
# Two samples of 100 draws with the same mean but different spread.
a <- rnorm(100, mean = 0 , sd = 1)
b <- rnorm(100, mean = 0 , sd = 3)
# Stack the two histograms and give them the same x range so the difference
# in spread is directly comparable.
par(mfrow = c(2,1))
hist(a, xlim = c(-10,10))
hist(b, xlim = c(-10,10))

?hist


# Density curve of the standard normal (mean 0, sd 1) on [-3, 3].
curve(dnorm ,-3,3)
# Height of the density at 0 ...
dnorm(0)
## [1] 0.3989423
# ... which is lower when the same distribution is spread out with sd = 5.
dnorm(0,mean =0,sd =5)
## [1] 0.07978846
# Cumulative distribution function: pnorm(q) is P(X <= q).
curve(pnorm (x), -3,3)

pnorm(0)
## [1] 0.5
# Mark the cutoff q = 0 on the density; pnorm(0) is the area to its left.
curve(dnorm ,-3,3)
abline(v = 0, col='red')

pnorm(1)
## [1] 0.8413447
# Same picture for the cutoff q = 1.
curve(dnorm ,-3,3)
abline(v = 1, col='red')

pnorm(1)
## [1] 0.8413447
pnorm(-1)
## [1] 0.1586553
# Upper tail P(X > 1); by symmetry it equals pnorm(-1).
pnorm(1, lower.tail = FALSE)
## [1] 0.1586553
# Probability of falling within 1, 2 and 3 standard deviations of the mean
# (the 68-95-99.7 rule).
pnorm(1) - pnorm(-1)
## [1] 0.6826895
pnorm(2) - pnorm(-2)
## [1] 0.9544997
pnorm(3) - pnorm(-3)
## [1] 0.9973002
# Check the one-standard-deviation rule empirically on the cdc heights.
# Relies on the `cdc` data frame loaded earlier in the script.
mu <- mean(cdc$height)
s  <- sd(cdc$height)

mu
## [1] 67.1829
mu - s
## [1] 63.05695
mu + s
## [1] 71.30885
# Proportion of heights within one standard deviation of the mean.
# The mean of a logical mask is the share of TRUE values, so there is no need
# to build the subset and compare lengths. For a normal distribution the
# theoretical value is about 0.68 (pnorm(1) - pnorm(-1)).
mean(cdc$height >= mu - s & cdc$height <= mu + s)
## [1] 0.62125
# Heights lying more than three standard deviations above the mean.
cdc$height[cdc$height > mu + 3 * s]
##  [1] 80 81 83 80 80 80 80 82 81 82 80 81 80 80 93 80 80 84
# Shapes of some non-normal distributions, for comparison.
set.seed(50)
?runif
# 10000 draws from the continuous uniform distribution on [0, 5].
y <- runif(10000,0,5)
hist(y)

# Discrete uniform: counts of each face over 1000 rolls of a fair die.
dices <- 1:6

barplot(table(sample(dices, 1000, replace=TRUE)) )

# 5000 draws from a Poisson distribution with rate (lambda) 3.
hist(rpois(5000 ,3))

# Formal normality tests. In both tests the null hypothesis is that the data
# come from the reference (normal) distribution, so a small p-value is
# evidence against normality.
set.seed(123)
# shapiro.test() accepts between 3 and 5000 non-missing values, so take a
# random sample of 5000 heights (the largest size it allows).
sample_height <- sample(cdc$height, 5000 )
hist(sample_height)

?shapiro.test
# Heights: the p-value is tiny, so normality is rejected.
shapiro.test(sample_height)
## 
##  Shapiro-Wilk normality test
## 
## data:  sample_height
## W = 0.98679, p-value < 2.2e-16
# Truly normal data: large p-value, normality is not rejected.
shapiro.test(rnorm(100, mean = 5, sd = 3))
## 
##  Shapiro-Wilk normality test
## 
## data:  rnorm(100, mean = 5, sd = 3)
## W = 0.98397, p-value = 0.2673
# Uniform data: small p-value, normality is rejected.
shapiro.test(runif(100, min = 2, max = 4))
## 
##  Shapiro-Wilk normality test
## 
## data:  runif(100, min = 2, max = 4)
## W = 0.95522, p-value = 0.001862
# Kolmogorov-Smirnov test of a sample against the standard normal CDF (pnorm).
ks.test(rnorm(100), 'pnorm')
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  rnorm(100)
## D = 0.096679, p-value = 0.3073
## alternative hypothesis: two-sided

# Sampling Distribution ----

# Distribution of all heights in the data set, for reference.
hist(cdc$height, breaks = 50)

?rep
# rep() repeats a value (or a whole vector) a given number of times.
rep(0, 5)
## [1] 0 0 0 0 0
rep(c(0,1), 5)
##  [1] 0 1 0 1 0 1 0 1 0 1
# Two ways to fill a vector inside a loop.
# Bad method: start from an empty vector and grow it with c() on every
# iteration. Each append builds a new, longer vector, which gets slow for
# long loops.
a <- c()
for (i in 1:10){
  a <- c(a, i)
}
a
##  [1]  1  2  3  4  5  6  7  8  9 10
# Good method: preallocate the full-length vector (filled with NA) once, then
# assign into position i.
a <- rep(NA, 10)
for (i in 1:10){
  a[i] <- i 
}
a
##  [1]  1  2  3  4  5  6  7  8  9 10
# Build sampling distributions of the mean height with a preallocated loop:
# 5000 repetitions, each drawing a sample of size 10, 50 and 100 from
# cdc$height (without replacement) and storing that sample's mean.
# Note: these three vectors are reassigned right after the loop, so the values
# stored here are not the ones used further down.
sample_mean10  <- rep(NA , 5000)
sample_mean50  <- rep(NA , 5000)
sample_mean100 <- rep(NA , 5000)

for(i in 1:5000){
  # samp is reused as a scratch variable for each sample size.
  samp             <- sample(cdc$height, 10)
  sample_mean10[i] <- mean(samp)
  
  samp             <- sample(cdc$height, 50)
  sample_mean50[i] <- mean(samp)
  
  samp             <- sample(cdc$height, 100)
  sample_mean100[i] <- mean(samp)
}

# Functional version of the sampling-distribution simulation: for each sample
# size, draw 5000 samples (without replacement) from cdc$height and keep each
# sample's mean. These assignments overwrite the vectors filled by the
# preceding for loop.
# vapply() with FUN.VALUE = numeric(1) is used instead of sapply() so each
# result is guaranteed to be a plain numeric vector of length 5000; sapply()
# would silently change its return type on unexpected input. The index
# argument `e` is unused and only drives the repetition.
sample_mean10 <- vapply(
  seq_len(5000), function(e) mean(sample(cdc$height, 10)), numeric(1)
)

sample_mean50 <- vapply(
  seq_len(5000), function(e) mean(sample(cdc$height, 50)), numeric(1)
)

sample_mean100 <- vapply(
  seq_len(5000), function(e) mean(sample(cdc$height, 100)), numeric(1)
)

sample_mean1000 <- vapply(
  seq_len(5000), function(e) mean(sample(cdc$height, 1000)), numeric(1)
)

# Centre and spread of each sampling distribution. For every sample size the
# two printed values are mean -/+ 3 standard deviations of the sample means;
# the interval gets narrower as the sample size grows.
# Sample size 10.
mu10 <- mean(sample_mean10)
sd10 <- sd(sample_mean10)

mu10 - sd10 * 3
## [1] 63.2492
mu10 + sd10 * 3
## [1] 71.09008
# Sample size 50.
mu50 <- mean(sample_mean50)
sd50 <- sd(sample_mean50)

mu50 - sd50 * 3
## [1] 65.44243
mu50 + sd50 * 3
## [1] 68.93508
# Sample size 100.
mu100 <- mean(sample_mean100)
sd100 <- sd(sample_mean100)

mu100 - sd100 * 3
## [1] 65.95618
mu100 + sd100 * 3
## [1] 68.41622
# Sample size 1000.
mu1000 <- mean(sample_mean1000)
sd1000 <- sd(sample_mean1000)

mu1000 - sd1000 * 3
## [1] 66.80043
mu1000 + sd1000 * 3
## [1] 67.56821
# Plot the four sampling distributions on top of each other.
# Mean of all heights in the data set, drawn as a red reference line below.
height_mu <- mean(cdc$height)

# Use the range of the size-10 sample means as the x range for every panel, so
# the four histograms share one scale.
xlimits <- range(sample_mean10)

# Four stacked panels, one per sample size.
par (mfrow =c(4, 1))
hist(sample_mean10, breaks = 20, xlim = xlimits)
abline(v = height_mu, col='red')
hist(sample_mean50, breaks = 20, xlim = xlimits)
abline(v = height_mu, col='red')
hist(sample_mean100, breaks= 20, xlim = xlimits)
abline(v = height_mu, col='red')
hist(sample_mean1000, breaks= 20, xlim = xlimits)
abline(v = height_mu, col='red')

# Compare the overall mean with the mean of one single sample of 10 heights.
height_mu
## [1] 67.1829
mean(sample(cdc$height, 10))
## [1] 65.5