Libraries
library(tidyverse)
library(GGally)
library(readxl)
library(knitr)
Function heavily modified from https://stackoverflow.com/a/12429538
avg <- 435
stddev <- 72
trials <- 1e+03
df <- tibble(x = seq(from = avg - 4*stddev,
to = avg + 4*stddev,
by = 8*stddev/trials),
y = dnorm(x, mean = avg, sd = stddev))
p <-
df %>%
ggplot(aes(x, y)) +
geom_line() +
scale_x_continuous(breaks = seq(from = avg - 3*stddev,
to = avg + 3*stddev,
by = stddev)) +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("Real Estate Broker's Examination Distribution") +
xlab("Scores") +
ylab("Density")
shade <- function(left, right){
shade =
rbind(c(left, 0),
subset(df, left < x & x < right),
c(right, 0))
area = pnorm(right, mean = avg, sd = stddev) - pnorm(left, mean = avg, sd = stddev)
p +
geom_polygon(data = shade, aes(x, y), fill = "blue") +
annotate("text",
x = 1.15*left, # avg - 3.25*stddev,
y = 100*min(df$y), col = "red",
label = paste0("Area: ", round(area, 3)))
}
qnorm(p = 0.35, mean = avg, sd = stddev, lower.tail = FALSE)
## [1] 462.7431
qnorm(p = 1 - 0.35, mean = avg, sd = stddev)
## [1] 462.7431
shade(462.7431, avg + 4*stddev)
shade(440, 560)
pnorm(q = 560, mean = avg, sd = stddev) - pnorm(q = 440, mean = avg, sd = stddev)
## [1] 0.4310458
shade(380, 560)
pnorm(620, mean = avg, sd = stddev) - pnorm(380, mean = avg, sd = stddev)
## [1] 0.7724402
shade(320, 680)
pnorm(680, mean = avg, sd = stddev) - pnorm(320, mean = avg, sd = stddev)
## [1] 0.9445584
shade(410, 590)
pnorm(590, mean = avg, sd = stddev) - pnorm(410, mean = avg, sd = stddev)
## [1] 0.6201197
samp.avg <- 700
avg <- samp.avg
samp.stddev <- 180
n <- 36
stddev <- samp.stddev * sqrt(n)
df <- tibble(x = seq(from = avg - 4*stddev,
to = avg + 4*stddev,
by = 8*stddev/trials),
y = dnorm(x, mean = avg, sd = stddev))
p <-
df %>%
ggplot(aes(x, y)) +
geom_line() +
scale_x_continuous(breaks = seq(from = avg - 3*stddev,
to = avg + 3*stddev,
by = stddev)) +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("Credit Card Debt after One Year") +
xlab("USD") +
ylab("Density")
shade(280, 3300)
(pnorm(q = 3330, mean = avg, sd = stddev) - pnorm(q = 280, mean = avg, sd = stddev)) * n
## [1] 23.17963
shade(520, 880)
shade(avg - 4*stddev, 850)
shade(850, avg + 4*stddev)
trials <- 11500
avg <- 11.5
stddev <- 2.1
df <- tibble(x = seq(from = avg - 4*stddev,
to = avg + 4*stddev,
by = 8*stddev/trials),
y = dnorm(x, mean = avg, sd = stddev))
p <-
df %>%
ggplot(aes(x, y)) +
geom_line() +
scale_x_continuous(breaks = seq(from = avg - 3*stddev,
to = avg + 3*stddev,
by = stddev)) +
theme(plot.title = element_text(hjust = 0.5)) +
ggtitle("Set of 11500 Values") +
xlab("Value") +
ylab("Density")
shade(11.5, 14)
shade(9, 11.6)
shade(10, 15)
(pnorm(15, mean = avg, sd = stddev) - pnorm(10, mean = avg, sd = stddev)) * trials
## [1] 8218.87
shade(14, avg + 4*stddev)
(1 - pnorm(14, mean = avg, sd = stddev)) * trials
## [1] 1344.691
(pnorm(14, mean = avg, sd = stddev, lower.tail = FALSE)) * trials
## [1] 1344.691
https://www.wikiwand.com/en/68%E2%80%9395%E2%80%9399.7_rule
shade(avg - stddev, avg + stddev)
(pnorm(avg + stddev, mean = avg, sd = stddev) - pnorm(avg - stddev, mean = avg, sd = stddev)) * trials
## [1] 7850.929
(pnorm(1) - pnorm(-1)) * trials
## [1] 7850.929
body.data <- read_excel("G:/My Drive/homework/Yves F/buad502a-m4-expert-dataset-bodyfat.xls")
sqrt(nrow(body.data))
## [1] 15.87451
body.data %>%
ggplot(aes(BODYFAT)) +
geom_histogram(bins = 15, show.legend = FALSE, fill = "grey", color = "black")
body.data %>%
ggplot(aes(WEIGHT)) +
geom_histogram(bins = 15, show.legend = FALSE, fill = "grey", color = "black")
body.data %>%
ggplot(aes(HEIGHT)) +
geom_histogram(bins = 15, show.legend = FALSE, fill = "grey", color = "black")
body.data %>%
select(BODYFAT, WEIGHT, HEIGHT) %>%
ggpairs()
body.data %>%
select(BODYFAT, WEIGHT, HEIGHT) %>%
pivot_longer(names_to = "Variable",
values_to = "Value",
cols = everything()) %>%
ggplot(aes(y = Value)) +
geom_boxplot() +
facet_wrap(~Variable) +
theme(axis.text.x = element_blank())
body.data %>%
select(BODYFAT, WEIGHT, HEIGHT) %>%
pivot_longer(names_to = "Variable",
values_to = "Value",
cols = everything()) %>%
ggplot(aes(y = Value)) +
geom_boxplot() +
facet_wrap(~Variable, scales = "free_y") +
theme(axis.text.x = element_blank())
body.data %>%
ggplot(aes(sample = BODYFAT)) +
geom_qq() +
geom_qq_line(col = "red") +
ggtitle("Normal Probability (QQ) Plot of Body Fat")
body.data %>%
ggplot(aes(sample = WEIGHT)) +
geom_qq() +
geom_qq_line(col = "red") +
ggtitle("Normal Probability (QQ) Plot of Weight")
body.data %>%
ggplot(aes(sample = HEIGHT)) +
geom_qq() +
geom_qq_line(col = "red") +
ggtitle("Normal Probability (QQ) Plot of Height")
body.data %>%
select(BODYFAT, WEIGHT, HEIGHT) %>%
pivot_longer(names_to = "Variable",
values_to = "Value",
cols = everything()) %>%
group_by(Variable) %>%
summarise(Mean = mean(Value),
StdDev = sd(Value))
## # A tibble: 3 x 3
## Variable Mean StdDev
## <chr> <dbl> <dbl>
## 1 BODYFAT 18.9 7.75
## 2 HEIGHT 70.1 3.66
## 3 WEIGHT 179. 29.4
# "within x standard deviations" vs what needs to be removed
n = nrow(body.data)
sd_bin <- function(stddev, value, inclusion){
lower = quantile(value, pnorm(stddev - 1, 0, 1))
upper = quantile(value, pnorm(stddev, 0, 1))
sum(value < upper) - sum(value < lower) # *
}
observed <-
body.data %>%
select(BODYFAT, WEIGHT, HEIGHT) %>%
pivot_longer(names_to = "Obs_SD",
values_to = "Value",
cols = everything()) %>%
group_by(Obs_SD) %>%
summarise("(-inf,-3)" = sum(Value < quantile(Value, pnorm(-3, 0, 1))),
"[-3,-2)" = sd_bin(-2, Value),
"[-2,-1)" = sd_bin(-1, Value),
"[-1,0)" = sd_bin(0, Value),
"(0,1]" = sd_bin(1, Value),
"(1,2]" = sd_bin(2, Value),
"(2,3]" = sd_bin(3, Value),
"(3,inf)" = sum(Value > quantile(Value, pnorm(3, 0, 1)))) %>%
rowwise() %>%
mutate(Total = sum(c_across(-Obs_SD)))
expected <-
tibble(Exp_SD = "Theoretical",
"(-inf,-3)" = pnorm(-3) * n,
"[-3,-2)" = (pnorm(-2) - pnorm(-3)) * n,
"[-2,-1)" = (pnorm(-1) - pnorm(-2)) * n,
"[-1,0)" = (pnorm(0) - pnorm(-1)) * n,
"(0,1]" = `[-1,0)`,
"(1,2]" = `[-2,-1)`,
"(2,3]" = `[-3,-2)`,
"(3,inf)" = `(-inf,-3)`) %>%
mutate(across(.cols = -Exp_SD,
.fns = round, 2)) %>%
rowwise() %>%
mutate(Total = sum(c_across(-Exp_SD)))
bind_rows(observed, expected) %>% kable()
Obs_SD | (-inf,-3) | [-3,-2) | [-2,-1) | [-1,0) | (0,1] | (1,2] | (2,3] | (3,inf) | Total | Exp_SD |
---|---|---|---|---|---|---|---|---|---|---|
BODYFAT | 1.00 | 5.00 | 34.00 | 83.00 | 89.00 | 34.00 | 5.00 | 1.00 | 252 | NA |
HEIGHT | 1.00 | 4.00 | 28.00 | 84.00 | 95.00 | 34.00 | 5.00 | 1.00 | 252 | NA |
WEIGHT | 1.00 | 4.00 | 34.00 | 87.00 | 85.00 | 35.00 | 5.00 | 1.00 | 252 | NA |
NA | 0.34 | 5.39 | 34.25 | 86.02 | 86.02 | 34.25 | 5.39 | 0.34 | 252 | Theoretical |
expected <- # mu = 0, sigma = 1
c(pnorm(-3), # left tail
pnorm(-2) - pnorm(-3), # -3 sd
pnorm(-1) - pnorm(-2), # -2 sd
pnorm(0) - pnorm(-1), # -1 sd
pnorm(1) - pnorm(0), # +1 sd
pnorm(2) - pnorm(1), # +2 sd
pnorm(3) - pnorm(2), # +3 sd
pnorm(3, lower.tail = FALSE) # right tail
)
observed.fun <- function(df, caption){
xbar <- mean(df)
s <- sd(df) # not sample sd
breaks.df <-
c(min(df),
xbar - 3 * s,
xbar - 2 * s,
xbar - 1 * s,
xbar,
xbar + 1 * s,
xbar + 2 * s,
xbar + 3 * s,
max(df)
)
counts <- hist(df, plot = FALSE, breaks = breaks.df)$count
hist(df,
labels = TRUE,
breaks = breaks.df,
freq = TRUE,
main = caption,
ylim = c(0, max(counts) + 10))
return(counts)
}
tibble(Observed = observed.fun(body.data$BODYFAT, "Body Fat"),
Expected = expected * n)
## # A tibble: 8 x 2
## Observed Expected
## <int> <dbl>
## 1 1 0.340
## 2 1 5.39
## 3 44 34.2
## 4 77 86.0
## 5 89 86.0
## 6 36 34.2
## 7 3 5.39
## 8 1 0.340
tibble(Observed = observed.fun(body.data$WEIGHT, "Weight"),
Expected = expected * n)
## # A tibble: 8 x 2
## Observed Expected
## <int> <dbl>
## 1 1 0.340
## 2 0 5.39
## 3 31 34.2
## 4 107 86.0
## 5 75 86.0
## 6 32 34.2
## 7 5 5.39
## 8 1 0.340
tibble(Observed = observed.fun(body.data$HEIGHT, "Height"),
Expected = expected * n)
## # A tibble: 8 x 2
## Observed Expected
## <int> <dbl>
## 1 1 0.340
## 2 0 5.39
## 3 15 34.2
## 4 111 86.0
## 5 101 86.0
## 6 22 34.2
## 7 2 5.39
## 8 0 0.340