Probability Distributions
Normal Distribution
rnorm(n = 10)
## [1] -0.1248288 1.0246240 -0.2975271 -0.4425841 1.7625705 0.4192821
## [7] -0.1952010 -0.1591306 -0.2250104 -0.2334090
rnorm(n = 10, mean = 100, sd = 20)
## [1] 102.02487 116.88542 106.63719 94.84955 109.14181 121.75903 102.87836
## [8] 136.92520 97.18729 73.01545
x <- rnorm(10)
x
## [1] 0.5869852 0.8233215 3.3438364 -0.3278451 -1.3953505 -0.6437764
## [7] -0.3908015 0.6890921 -0.1996074 1.8668981
dnorm(x = x)
## [1] 0.335808453 0.284259505 0.001489136 0.378068557 0.150703635
## [6] 0.324275259 0.369612015 0.314628563 0.391073369 0.069836901
dnorm(x = c(-1, 0, 1))
## [1] 0.2419707 0.3989423 0.2419707
library(ggplot2)
x <- rnorm(30000)
y <- dnorm(x = x)
z = data.frame(RandomNormalNumbers = x, RandomNormalDensity = y)
head(z)
## RandomNormalNumbers RandomNormalDensity
## 1 -0.7093424 0.31020503
## 2 0.7538619 0.30026422
## 3 -1.3899864 0.15183368
## 4 -0.4213337 0.36505781
## 5 1.6930575 0.09516335
## 6 0.5497466 0.34299165
g <- ggplot(data = z, mapping = aes(x = RandomNormalNumbers, y = RandomNormalDensity))
g + geom_point() + labs(x = "Random Normal Numbers", y = "Random Normal Density")
head(pnorm(q = y))
## [1] 0.6217975 0.6180122 0.5603409 0.6424659 0.5379075 0.6341976
pnorm(q = c(-3, 0, 3))
## [1] 0.001349898 0.500000000 0.998650102
pnorm(1) - pnorm(0)
## [1] 0.3413447
head(x)
## [1] -0.7093424 0.7538619 -1.3899864 -0.4213337 1.6930575 0.5497466
head(pnorm(x))
## [1] 0.23905603 0.77453394 0.08226651 0.33675573 0.95477774 0.70875339
head(qnorm(pnorm(x)))
## [1] -0.7093424 0.7538619 -1.3899864 -0.4213337 1.6930575 0.5497466
head(x)
## [1] -0.7093424 0.7538619 -1.3899864 -0.4213337 1.6930575 0.5497466
Binomial Distribution
rbinom(n = 1, size = 10, prob = 0.4)
## [1] 2
rbinom(n = 5, size = 10, prob = 0.4)
## [1] 4 5 3 7 4
dbinom(x = 3, size = 10, prob = 0.3)
## [1] 0.2668279
pbinom(q = 3, size = 10, prob = 0.3)
## [1] 0.6496107
qbinom(p = 0.3, size = 10, prob = 0.3)
## [1] 2
qbinom(p = c(0.3, 0.35, 0.4), size = 10, prob = 0.4)
## [1] 3 3 4
Bernoulli Distribution
rbinom(n = 10, size = 1, prob = 0.5)
## [1] 1 1 0 0 0 1 0 0 1 0
x <- data.frame(Successes = rbinom(n = 1000, size = 10, prob = 0.3))
Approximating to Normal Distribution
ggplot(x, aes(Successes)) + geom_histogram()
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
y <- data.frame(Successes = rbinom(n = 1000, size = 5, prob = 0.2), Size = 5)
head(y)
## Successes Size
## 1 2 5
## 2 0 5
## 3 1 5
## 4 0 5
## 5 1 5
## 6 2 5
z1 <- data.frame(Successes = rbinom(n = 1000, size = 10, prob = 0.2), Size = 10)
head(z1)
## Successes Size
## 1 6 10
## 2 5 10
## 3 2 10
## 4 1 10
## 5 3 10
## 6 1 10
z2 <- data.frame(Successes = rbinom(n = 1000, size = 100, prob = 0.2), Size = 100)
head(z2)
## Successes Size
## 1 14 100
## 2 10 100
## 3 17 100
## 4 23 100
## 5 21 100
## 6 12 100
z3 <- data.frame(Successes = rbinom(n = 1000, size = 100, prob = 0.2), Size = 1000)
head(z3)
## Successes Size
## 1 19 1000
## 2 21 1000
## 3 24 1000
## 4 20 1000
## 5 23 1000
## 6 21 1000
z4 <- rbind(Binom10 = z1, Binom100 = z2, Binom1000 = z3)
head(z4)
## Successes Size
## Binom10.1 6 10
## Binom10.2 5 10
## Binom10.3 2 10
## Binom10.4 1 10
## Binom10.5 3 10
## Binom10.6 1 10
tail(z4)
## Successes Size
## Binom1000.995 15 1000
## Binom1000.996 22 1000
## Binom1000.997 13 1000
## Binom1000.998 17 1000
## Binom1000.999 15 1000
## Binom1000.1000 21 1000
ggplot(z4, aes(x = Successes)) + geom_histogram() + facet_wrap(facets = ~ Size, scales = "free")
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
## stat_bin: binwidth defaulted to range/30. Use 'binwidth = x' to adjust this.
Poisson Distribution
p1 <- rpois(n = 1000, lambda = 1)
head(p1)
## [1] 1 2 1 1 3 1
p2 <- rpois(n = 1000, lambda = 2)
head(p2)
## [1] 0 0 1 1 0 3
p5 <- rpois(n = 1000, lambda = 5)
head(p2)
## [1] 0 0 1 1 0 3
p10 <- rpois(n = 1000, lambda = 10)
head(p10)
## [1] 10 13 8 10 14 7
p20 <- rpois(n = 1000, lambda = 20)
head(p20)
## [1] 13 18 23 15 18 23
pois <- data.frame(lambda1 = p1, lambda2 = p2, lambda5 = p5, lambda10 = p10, lambda20 = p20)
head(pois)
## lambda1 lambda2 lambda5 lambda10 lambda20
## 1 1 0 3 10 13
## 2 2 0 2 13 18
## 3 1 1 7 8 23
## 4 1 1 6 10 15
## 5 3 0 3 14 18
## 6 1 3 3 7 23
library(reshape2)
pois <- melt(data = pois, variable.name = "lambda", value.name = "x")
## No id variables; using all as measure variables
head(pois)
## lambda x
## 1 lambda1 1
## 2 lambda1 2
## 3 lambda1 1
## 4 lambda1 1
## 5 lambda1 3
## 6 lambda1 1
tail(pois)
## lambda x
## 4995 lambda20 26
## 4996 lambda20 21
## 4997 lambda20 20
## 4998 lambda20 18
## 4999 lambda20 16
## 5000 lambda20 17
library(stringr)
pois$lambda <- str_extract(string = pois$lambda, pattern = "\\d+")
head(pois)
## lambda x
## 1 1 1
## 2 1 2
## 3 1 1
## 4 1 1
## 5 1 3
## 6 1 1
class(pois$lambda)
## [1] "character"
pois$lambda <- as.factor(as.numeric(pois$lambda))
tail(pois)
## lambda x
## 4995 20 26
## 4996 20 21
## 4997 20 20
## 4998 20 18
## 4999 20 16
## 5000 20 17
ggplot(pois, aes(x = x)) + geom_density(mapping = aes(group = lambda, color = lambda, fill = lambda), adjust = 4, alpha = 1/2) + scale_color_discrete() + scale_fill_discrete() + ggtitle("Probability Mass Function")
Basic Statistics
Mean
x <- sample(x = 1:100, size = 100, replace = TRUE)
x
## [1] 57 55 69 70 51 60 8 5 96 58 41 88 59 99 83 76 88 83 40 2 19 41 55
## [24] 74 35 94 33 21 80 38 54 94 44 71 37 85 67 72 44 18 30 37 31 86 87 21
## [47] 78 52 31 78 6 13 49 77 62 1 33 31 93 40 72 70 12 17 31 77 85 61 6
## [70] 19 2 71 61 16 3 69 21 6 48 58 58 23 61 80 93 25 7 3 21 11 72 51
## [93] 41 15 5 94 30 91 51 17
mean(x)
## [1] 48.54
y <- x
y[sample(1:100, 20, replace = FALSE)] <- NA
y
## [1] 57 55 69 70 NA 60 8 5 NA 58 41 NA 59 99 83 76 88 NA 40 2 NA NA 55
## [24] 74 35 94 33 21 NA 38 54 94 44 NA 37 85 NA 72 44 18 30 37 31 86 87 21
## [47] NA 52 31 78 6 13 49 NA 62 1 33 NA 93 40 72 NA NA 17 31 77 85 NA 6
## [70] 19 NA 71 61 16 3 69 21 6 48 58 58 23 61 80 NA 25 7 3 21 11 72 51
## [93] 41 NA NA 94 30 NA 51 17
mean(y)
## [1] NA
mean(y, na.rm = TRUE)
## [1] 46.5375
z <- c(95, 72, 23, 34)
w <- c(1/2, 1/4, 1/8, 1/8)
mean(z)
## [1] 56
weighted.mean(x = z, w = w)
## [1] 72.625
Variance and Standard Deviation
var(x)
## [1] 832.3317
mean(x)
## [1] 48.54
sum((x - mean(x)) ** 2) / (length(x) - 1)
## [1] 832.3317
sqrt(sum((x - mean(x)) ** 2) / (length(x) - 1))
## [1] 28.85016
sd(x)
## [1] 28.85016
sd(y)
## [1] NA
sd(y, na.rm = TRUE)
## [1] 27.88082
Common Functions
sum(x)
## [1] 4854
min(x)
## [1] 1
max(x)
## [1] 99
sum(y)
## [1] NA
sum(y, na.rm = TRUE)
## [1] 3723
summary(x)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 21.00 51.00 48.54 72.00 99.00
summary(y)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 21.00 46.00 46.54 70.25 99.00 20
quantile(x = x, probs = c(0.25, 0.75))
## 25% 75%
## 21 72
quantile(x, seq(0, 1, length = 10))
## 0% 11.11111% 22.22222% 33.33333% 44.44444% 55.55556% 66.66667%
## 1 8 21 31 41 55 67
## 77.77778% 88.88889% 100%
## 76 86 99
Correlation
library(ggplot2)
data("economics")
head(economics)
## date pce pop psavert uempmed unemploy
## 1 1967-06-30 507.8 198712 9.8 4.5 2944
## 2 1967-07-31 510.9 198911 9.8 4.7 2945
## 3 1967-08-31 516.7 199113 9.0 4.6 2958
## 4 1967-09-30 513.3 199311 9.8 4.9 3143
## 5 1967-10-31 518.5 199498 9.7 4.7 3066
## 6 1967-11-30 526.2 199657 9.4 4.8 3018
cor(economics$pce, economics$psavert)
## [1] -0.9271222
cor(economics[, 4:6])
## psavert uempmed unemploy
## psavert 1.00000000 -0.3615301 -0.07641651
## uempmed -0.36153012 1.0000000 0.78427918
## unemploy -0.07641651 0.7842792 1.00000000
x <- cor(economics[, 4:6])
library(reshape2)
y <- melt(data = x, varnames = c("x", "y"), value.name = "Correlation")
y
## x y Correlation
## 1 psavert psavert 1.00000000
## 2 uempmed psavert -0.36153012
## 3 unemploy psavert -0.07641651
## 4 psavert uempmed -0.36153012
## 5 uempmed uempmed 1.00000000
## 6 unemploy uempmed 0.78427918
## 7 psavert unemploy -0.07641651
## 8 uempmed unemploy 0.78427918
## 9 unemploy unemploy 1.00000000
y <- y[order(y$Correlation), ]
y
## x y Correlation
## 2 uempmed psavert -0.36153012
## 4 psavert uempmed -0.36153012
## 3 unemploy psavert -0.07641651
## 7 psavert unemploy -0.07641651
## 6 unemploy uempmed 0.78427918
## 8 uempmed unemploy 0.78427918
## 1 psavert psavert 1.00000000
## 5 uempmed uempmed 1.00000000
## 9 unemploy unemploy 1.00000000
ggplot(data = y, aes(x, y)) + geom_tile(aes(fill = Correlation)) + scale_fill_gradient2(high = "steelblue", guide = guide_colorbar(ticks = FALSE, barheight = 10, limits = c(-1, 1))) + theme_minimal() + labs(x = NULL, NULL)
cor(economics[sapply(economics, is.numeric)], use = "everything")
## pce pop psavert uempmed unemploy
## pce 1.0000000 0.9879665 -0.92712221 0.5145862 0.32441514
## pop 0.9879665 1.0000000 -0.89086571 0.5494114 0.41227726
## psavert -0.9271222 -0.8908657 1.00000000 -0.3615301 -0.07641651
## uempmed 0.5145862 0.5494114 -0.36153012 1.0000000 0.78427918
## unemploy 0.3244151 0.4122773 -0.07641651 0.7842792 1.00000000