1 Probability Distributions

1.1 Normal Distribution

rnorm(n = 10)
##  [1] -0.1248288  1.0246240 -0.2975271 -0.4425841  1.7625705  0.4192821
##  [7] -0.1952010 -0.1591306 -0.2250104 -0.2334090
rnorm(n = 10, mean = 100, sd = 20)
##  [1] 102.02487 116.88542 106.63719  94.84955 109.14181 121.75903 102.87836
##  [8] 136.92520  97.18729  73.01545
x <- rnorm(10)

x
##  [1]  0.5869852  0.8233215  3.3438364 -0.3278451 -1.3953505 -0.6437764
##  [7] -0.3908015  0.6890921 -0.1996074  1.8668981
dnorm(x = x)
##  [1] 0.335808453 0.284259505 0.001489136 0.378068557 0.150703635
##  [6] 0.324275259 0.369612015 0.314628563 0.391073369 0.069836901
dnorm(x = c(-1, 0, 1))
## [1] 0.2419707 0.3989423 0.2419707
library(ggplot2)

x <- rnorm(30000)

y <- dnorm(x = x)

z = data.frame(RandomNormalNumbers = x, RandomNormalDensity = y)

head(z)
##   RandomNormalNumbers RandomNormalDensity
## 1          -0.7093424          0.31020503
## 2           0.7538619          0.30026422
## 3          -1.3899864          0.15183368
## 4          -0.4213337          0.36505781
## 5           1.6930575          0.09516335
## 6           0.5497466          0.34299165
g <- ggplot(data = z, mapping = aes(x = RandomNormalNumbers, y = RandomNormalDensity))

g + geom_point() + labs(x = "Random Normal Numbers", y = "Random Normal Density")
head(pnorm(q = y))
## [1] 0.6217975 0.6180122 0.5603409 0.6424659 0.5379075 0.6341976
pnorm(q = c(-3, 0, 3))
## [1] 0.001349898 0.500000000 0.998650102
pnorm(1) - pnorm(0)
## [1] 0.3413447
head(x) 
## [1] -0.7093424  0.7538619 -1.3899864 -0.4213337  1.6930575  0.5497466
head(pnorm(x))
## [1] 0.23905603 0.77453394 0.08226651 0.33675573 0.95477774 0.70875339
head(qnorm(pnorm(x)))
## [1] -0.7093424  0.7538619 -1.3899864 -0.4213337  1.6930575  0.5497466
head(x)
## [1] -0.7093424  0.7538619 -1.3899864 -0.4213337  1.6930575  0.5497466

1.2 Binomial Distribution

rbinom(n = 1, size = 10, prob = 0.4)
## [1] 2
rbinom(n = 5, size = 10, prob = 0.4)
## [1] 4 5 3 7 4
dbinom(x = 3, size = 10, prob = 0.3)
## [1] 0.2668279
pbinom(q = 3, size = 10, prob = 0.3)
## [1] 0.6496107
qbinom(p = 0.3, size = 10, prob = 0.3)
## [1] 2
qbinom(p = c(0.3, 0.35, 0.4), size = 10, prob = 0.4)
## [1] 3 3 4

1.2.1 Bernoulli Distribution

rbinom(n = 10, size = 1, prob = 0.5)
##  [1] 1 1 0 0 0 1 0 0 1 0
x <- data.frame(Successes = rbinom(n = 1000, size = 10, prob = 0.3))

1.2.2 Approximating to Normal Distribution

ggplot(x, aes(Successes)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
y <- data.frame(Successes = rbinom(n = 1000, size = 5, prob = 0.2), Size = 5)

head(y)
##   Successes Size
## 1         2    5
## 2         0    5
## 3         1    5
## 4         0    5
## 5         1    5
## 6         2    5
z1 <- data.frame(Successes = rbinom(n = 1000, size = 10, prob = 0.2), Size = 10)

head(z1)
##   Successes Size
## 1         6   10
## 2         5   10
## 3         2   10
## 4         1   10
## 5         3   10
## 6         1   10
z2 <- data.frame(Successes = rbinom(n = 1000, size = 100, prob = 0.2), Size = 100)

head(z2)
##   Successes Size
## 1        14  100
## 2        10  100
## 3        17  100
## 4        23  100
## 5        21  100
## 6        12  100
z3 <- data.frame(Successes = rbinom(n = 1000, size = 100, prob = 0.2), Size = 1000)

head(z3)
##   Successes Size
## 1        19 1000
## 2        21 1000
## 3        24 1000
## 4        20 1000
## 5        23 1000
## 6        21 1000
z4 <- rbind(Binom10 = z1, Binom100 = z2, Binom1000 = z3)

head(z4)
##           Successes Size
## Binom10.1         6   10
## Binom10.2         5   10
## Binom10.3         2   10
## Binom10.4         1   10
## Binom10.5         3   10
## Binom10.6         1   10
tail(z4)
##                Successes Size
## Binom1000.995         15 1000
## Binom1000.996         22 1000
## Binom1000.997         13 1000
## Binom1000.998         17 1000
## Binom1000.999         15 1000
## Binom1000.1000        21 1000
ggplot(z4, aes(x = Successes)) + geom_histogram() + facet_wrap(facets = ~ Size, scales = "free")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.

1.3 Poisson Distribution

p1 <- rpois(n = 1000, lambda = 1)

head(p1)
## [1] 1 2 1 1 3 1
p2 <- rpois(n = 1000, lambda = 2)

head(p2)
## [1] 0 0 1 1 0 3
p5 <- rpois(n = 1000, lambda = 5)

head(p2)
## [1] 0 0 1 1 0 3
p10 <- rpois(n = 1000, lambda = 10)

head(p10)
## [1] 10 13  8 10 14  7
p20 <- rpois(n = 1000, lambda = 20)

head(p20)
## [1] 13 18 23 15 18 23
pois <- data.frame(lambda1 = p1, lambda2 = p2, lambda5 = p5, lambda10 = p10, lambda20 = p20)

head(pois)
##   lambda1 lambda2 lambda5 lambda10 lambda20
## 1       1       0       3       10       13
## 2       2       0       2       13       18
## 3       1       1       7        8       23
## 4       1       1       6       10       15
## 5       3       0       3       14       18
## 6       1       3       3        7       23
library(reshape2)

pois <- melt(data = pois, variable.name = "lambda", value.name = "x")
## No id variables; using all as measure variables
head(pois)
##    lambda x
## 1 lambda1 1
## 2 lambda1 2
## 3 lambda1 1
## 4 lambda1 1
## 5 lambda1 3
## 6 lambda1 1
tail(pois)
##        lambda  x
## 4995 lambda20 26
## 4996 lambda20 21
## 4997 lambda20 20
## 4998 lambda20 18
## 4999 lambda20 16
## 5000 lambda20 17
library(stringr)

pois$lambda <- str_extract(string = pois$lambda, pattern = "\\d+")

head(pois)
##   lambda x
## 1      1 1
## 2      1 2
## 3      1 1
## 4      1 1
## 5      1 3
## 6      1 1
class(pois$lambda)
## [1] "character"
pois$lambda <- as.factor(as.numeric(pois$lambda))

tail(pois)
##      lambda  x
## 4995     20 26
## 4996     20 21
## 4997     20 20
## 4998     20 18
## 4999     20 16
## 5000     20 17
ggplot(pois, aes(x = x)) + geom_density(mapping = aes(group = lambda, color = lambda, fill = lambda), adjust = 4, alpha = 1/2) + scale_color_discrete() + scale_fill_discrete() + ggtitle("Probability Mass Function")

2 Basic Statistics

2.1 Mean

x <- sample(x = 1:100, size = 100, replace = TRUE)

x
##   [1] 57 55 69 70 51 60  8  5 96 58 41 88 59 99 83 76 88 83 40  2 19 41 55
##  [24] 74 35 94 33 21 80 38 54 94 44 71 37 85 67 72 44 18 30 37 31 86 87 21
##  [47] 78 52 31 78  6 13 49 77 62  1 33 31 93 40 72 70 12 17 31 77 85 61  6
##  [70] 19  2 71 61 16  3 69 21  6 48 58 58 23 61 80 93 25  7  3 21 11 72 51
##  [93] 41 15  5 94 30 91 51 17
mean(x)
## [1] 48.54
y <- x

y[sample(1:100, 20, replace = FALSE)] <- NA 

y
##   [1] 57 55 69 70 NA 60  8  5 NA 58 41 NA 59 99 83 76 88 NA 40  2 NA NA 55
##  [24] 74 35 94 33 21 NA 38 54 94 44 NA 37 85 NA 72 44 18 30 37 31 86 87 21
##  [47] NA 52 31 78  6 13 49 NA 62  1 33 NA 93 40 72 NA NA 17 31 77 85 NA  6
##  [70] 19 NA 71 61 16  3 69 21  6 48 58 58 23 61 80 NA 25  7  3 21 11 72 51
##  [93] 41 NA NA 94 30 NA 51 17
mean(y)
## [1] NA
mean(y, na.rm = TRUE)
## [1] 46.5375
z <- c(95, 72, 23, 34)
w <- c(1/2, 1/4, 1/8, 1/8)

mean(z)
## [1] 56
weighted.mean(x = z, w = w)
## [1] 72.625

2.2 Variance and Standard Deviation

var(x)
## [1] 832.3317
mean(x)
## [1] 48.54
sum((x - mean(x)) ** 2) / (length(x) - 1)
## [1] 832.3317
sqrt(sum((x - mean(x)) ** 2) / (length(x) - 1))
## [1] 28.85016
sd(x)
## [1] 28.85016
sd(y)
## [1] NA
sd(y, na.rm = TRUE)
## [1] 27.88082

2.3 Common Functions

sum(x)
## [1] 4854
min(x)
## [1] 1
max(x)
## [1] 99
sum(y)
## [1] NA
sum(y, na.rm = TRUE)
## [1] 3723
summary(x)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   21.00   51.00   48.54   72.00   99.00
summary(y)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   21.00   46.00   46.54   70.25   99.00      20
quantile(x = x, probs = c(0.25, 0.75))
## 25% 75% 
##  21  72
quantile(x, seq(0, 1, length = 10))
##        0% 11.11111% 22.22222% 33.33333% 44.44444% 55.55556% 66.66667% 
##         1         8        21        31        41        55        67 
## 77.77778% 88.88889%      100% 
##        76        86        99

2.4 Correlation

library(ggplot2)

data("economics")

head(economics)
##         date   pce    pop psavert uempmed unemploy
## 1 1967-06-30 507.8 198712     9.8     4.5     2944
## 2 1967-07-31 510.9 198911     9.8     4.7     2945
## 3 1967-08-31 516.7 199113     9.0     4.6     2958
## 4 1967-09-30 513.3 199311     9.8     4.9     3143
## 5 1967-10-31 518.5 199498     9.7     4.7     3066
## 6 1967-11-30 526.2 199657     9.4     4.8     3018
cor(economics$pce, economics$psavert)
## [1] -0.9271222
cor(economics[, 4:6])
##              psavert    uempmed    unemploy
## psavert   1.00000000 -0.3615301 -0.07641651
## uempmed  -0.36153012  1.0000000  0.78427918
## unemploy -0.07641651  0.7842792  1.00000000
x <- cor(economics[, 4:6])
 
library(reshape2)

y <- melt(data = x, varnames = c("x", "y"), value.name = "Correlation")

y
##          x        y Correlation
## 1  psavert  psavert  1.00000000
## 2  uempmed  psavert -0.36153012
## 3 unemploy  psavert -0.07641651
## 4  psavert  uempmed -0.36153012
## 5  uempmed  uempmed  1.00000000
## 6 unemploy  uempmed  0.78427918
## 7  psavert unemploy -0.07641651
## 8  uempmed unemploy  0.78427918
## 9 unemploy unemploy  1.00000000
y <- y[order(y$Correlation), ]

y
##          x        y Correlation
## 2  uempmed  psavert -0.36153012
## 4  psavert  uempmed -0.36153012
## 3 unemploy  psavert -0.07641651
## 7  psavert unemploy -0.07641651
## 6 unemploy  uempmed  0.78427918
## 8  uempmed unemploy  0.78427918
## 1  psavert  psavert  1.00000000
## 5  uempmed  uempmed  1.00000000
## 9 unemploy unemploy  1.00000000
ggplot(data = y, aes(x, y)) + geom_tile(aes(fill = Correlation)) + scale_fill_gradient2(high = "steelblue", guide = guide_colorbar(ticks = FALSE, barheight = 10, limits = c(-1, 1))) + theme_minimal() + labs(x = NULL, NULL)
cor(economics[sapply(economics, is.numeric)], use = "everything")
##                 pce        pop     psavert    uempmed    unemploy
## pce       1.0000000  0.9879665 -0.92712221  0.5145862  0.32441514
## pop       0.9879665  1.0000000 -0.89086571  0.5494114  0.41227726
## psavert  -0.9271222 -0.8908657  1.00000000 -0.3615301 -0.07641651
## uempmed   0.5145862  0.5494114 -0.36153012  1.0000000  0.78427918
## unemploy  0.3244151  0.4122773 -0.07641651  0.7842792  1.00000000