Libraries

library(tidyverse)
library(GGally)
library(readxl)
library(knitr)

Question 1

Create theoretical distribution

Function heavily modified from https://stackoverflow.com/a/12429538

avg <- 435
stddev <- 72
trials <- 1e+03

df <- tibble(x = seq(from = avg - 4*stddev,
                     to = avg + 4*stddev,
                     by = 8*stddev/trials),
             y = dnorm(x, mean = avg, sd = stddev))

p <- 
  df %>%
  ggplot(aes(x, y)) +
  geom_line() +
  scale_x_continuous(breaks = seq(from = avg - 3*stddev,
                                  to = avg + 3*stddev,
                                  by = stddev)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Real Estate Broker's Examination Distribution") +
  xlab("Scores") +
  ylab("Density")

shade <- function(left, right){
  shade =
    rbind(c(left, 0),
          subset(df, left < x & x < right),
          c(right, 0))
  
  area = pnorm(right, mean = avg, sd = stddev) - pnorm(left, mean = avg, sd = stddev)
  
  p + 
    geom_polygon(data = shade, aes(x, y), fill = "blue") +
    annotate("text",
             x = 1.15*left, # avg - 3.25*stddev,
             y = 100*min(df$y), col = "red",
             label = paste0("Area: ", round(area, 3)))
}

Part (0)

qnorm(p = 0.35, mean = avg, sd = stddev, lower.tail = FALSE)
## [1] 462.7431
qnorm(p = 1 - 0.35, mean = avg, sd = stddev)
## [1] 462.7431
shade(462.7431, avg + 4*stddev)

Part (a)

shade(440, 560)

pnorm(q = 560, mean = avg, sd = stddev) - pnorm(q = 440, mean = avg, sd = stddev)
## [1] 0.4310458

Part (b)

shade(380, 560)

pnorm(620, mean = avg, sd = stddev) - pnorm(380, mean = avg, sd = stddev)
## [1] 0.7724402

Part (c)

shade(320, 680)

pnorm(680, mean = avg, sd = stddev) - pnorm(320, mean = avg, sd = stddev)
## [1] 0.9445584

Part (d)

shade(410, 590)

pnorm(590, mean = avg, sd = stddev) - pnorm(410, mean = avg, sd = stddev)
## [1] 0.6201197

Question 2

Create theoretical distribution

samp.avg <- 700
avg <- samp.avg
samp.stddev <- 180
n <- 36
stddev <- samp.stddev * sqrt(n)

df <- tibble(x = seq(from = avg - 4*stddev,
                     to = avg + 4*stddev,
                     by = 8*stddev/trials),
             y = dnorm(x, mean = avg, sd = stddev))

p <- 
  df %>%
  ggplot(aes(x, y)) +
  geom_line() +
  scale_x_continuous(breaks = seq(from = avg - 3*stddev,
                                  to = avg + 3*stddev,
                                  by = stddev)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Credit Card Debt after One Year") +
  xlab("USD") +
  ylab("Density")

Part (a)

shade(280, 3300)

(pnorm(q = 3330, mean = avg, sd = stddev) - pnorm(q = 280, mean = avg, sd = stddev)) * n
## [1] 23.17963

Part (b)

shade(520, 880)

Part (c)

shade(avg - 4*stddev, 850)

Part (d)

shade(850, avg + 4*stddev)

Question 3

Create theoretical distribution

trials <- 11500
avg <- 11.5
stddev <- 2.1

df <- tibble(x = seq(from = avg - 4*stddev,
                     to = avg + 4*stddev,
                     by = 8*stddev/trials),
             y = dnorm(x, mean = avg, sd = stddev))

p <- 
  df %>%
  ggplot(aes(x, y)) +
  geom_line() +
  scale_x_continuous(breaks = seq(from = avg - 3*stddev,
                                  to = avg + 3*stddev,
                                  by = stddev)) +
  theme(plot.title = element_text(hjust = 0.5)) +
  ggtitle("Set of 11500 Values") +
  xlab("Value") +
  ylab("Density")

Part (a)

shade(11.5, 14)

Part (b)

shade(9, 11.6)

Part (c)

shade(10, 15)

(pnorm(15, mean = avg, sd = stddev) - pnorm(10, mean = avg, sd = stddev)) * trials
## [1] 8218.87

Part (d)

shade(14, avg + 4*stddev)

(1  - pnorm(14, mean = avg, sd = stddev)) * trials
## [1] 1344.691
(pnorm(14, mean = avg, sd = stddev, lower.tail = FALSE)) * trials
## [1] 1344.691

Part (e)

https://www.wikiwand.com/en/68%E2%80%9395%E2%80%9399.7_rule

shade(avg - stddev, avg + stddev)

(pnorm(avg + stddev, mean = avg, sd = stddev) - pnorm(avg - stddev, mean = avg, sd = stddev)) * trials
## [1] 7850.929
(pnorm(1) - pnorm(-1)) * trials
## [1] 7850.929

Question 4

Data

body.data <- read_excel("G:/My Drive/homework/Yves F/buad502a-m4-expert-dataset-bodyfat.xls")
sqrt(nrow(body.data))
## [1] 15.87451

Part (a)

body.data %>%
  ggplot(aes(BODYFAT)) +
  geom_histogram(bins = 15, show.legend = FALSE, fill = "grey", color = "black")

Part (b)

body.data %>%
  ggplot(aes(WEIGHT)) +
  geom_histogram(bins = 15, show.legend = FALSE, fill = "grey", color = "black")

Part (c)

body.data %>%
  ggplot(aes(HEIGHT)) +
  geom_histogram(bins = 15, show.legend = FALSE, fill = "grey", color = "black")

Part (c)(a)

body.data %>% 
  select(BODYFAT, WEIGHT, HEIGHT) %>%
  ggpairs()

body.data %>%
  select(BODYFAT, WEIGHT, HEIGHT) %>%
  pivot_longer(names_to = "Variable",
               values_to = "Value",
               cols = everything()) %>%
  ggplot(aes(y = Value)) +
  geom_boxplot() +
  facet_wrap(~Variable) +
  theme(axis.text.x = element_blank())

body.data %>%
  select(BODYFAT, WEIGHT, HEIGHT) %>%
  pivot_longer(names_to = "Variable",
               values_to = "Value",
               cols = everything()) %>%
  ggplot(aes(y = Value)) +
  geom_boxplot() +
  facet_wrap(~Variable, scales = "free_y") +
  theme(axis.text.x = element_blank())

Part (c)(b)

body.data %>%
  ggplot(aes(sample = BODYFAT)) +
  geom_qq() +
  geom_qq_line(col = "red") +
  ggtitle("Normal Probability (QQ) Plot of Body Fat")

Part (c)(d)

body.data %>%
  ggplot(aes(sample = WEIGHT)) +
  geom_qq() +
  geom_qq_line(col = "red") +
  ggtitle("Normal Probability (QQ) Plot of Weight")

Part (c)(d)

body.data %>%
  ggplot(aes(sample = HEIGHT)) +
  geom_qq() +
  geom_qq_line(col = "red") +
  ggtitle("Normal Probability (QQ) Plot of Height")

Part (c)(f)

body.data %>%
  select(BODYFAT, WEIGHT, HEIGHT) %>%
  pivot_longer(names_to = "Variable",
               values_to = "Value",
               cols = everything()) %>%
  group_by(Variable) %>%
  summarise(Mean = mean(Value),
            StdDev = sd(Value))
## # A tibble: 3 x 3
##   Variable  Mean StdDev
##   <chr>    <dbl>  <dbl>
## 1 BODYFAT   18.9   7.75
## 2 HEIGHT    70.1   3.66
## 3 WEIGHT   179.   29.4

Part (c)(g)

Version 1

# "within x standard deviations" vs what needs to be removed
n = nrow(body.data)

sd_bin <- function(stddev, value, inclusion){
  lower = quantile(value, pnorm(stddev - 1, 0, 1))
  upper = quantile(value, pnorm(stddev, 0, 1))
  sum(value < upper) - sum(value < lower) # *
}

observed <-
  body.data %>%
  select(BODYFAT, WEIGHT, HEIGHT) %>%
  pivot_longer(names_to = "Obs_SD",
               values_to = "Value",
               cols = everything()) %>%
  group_by(Obs_SD) %>%
  summarise("(-inf,-3)" = sum(Value < quantile(Value, pnorm(-3, 0, 1))),
            "[-3,-2)" = sd_bin(-2, Value),
            "[-2,-1)" = sd_bin(-1, Value),
            "[-1,0)" = sd_bin(0, Value),
            "(0,1]" = sd_bin(1, Value),
            "(1,2]" = sd_bin(2, Value),
            "(2,3]" = sd_bin(3, Value),
            "(3,inf)" = sum(Value > quantile(Value, pnorm(3, 0, 1)))) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(-Obs_SD)))

expected <-   
  tibble(Exp_SD = "Theoretical",
         "(-inf,-3)" = pnorm(-3) * n,
         "[-3,-2)" = (pnorm(-2) - pnorm(-3)) * n,
         "[-2,-1)" = (pnorm(-1) - pnorm(-2)) * n,
         "[-1,0)" = (pnorm(0) - pnorm(-1)) * n,
         "(0,1]" = `[-1,0)`,
         "(1,2]" = `[-2,-1)`,
         "(2,3]" = `[-3,-2)`,
         "(3,inf)" = `(-inf,-3)`) %>%
  mutate(across(.cols = -Exp_SD,
                .fns = round, 2)) %>%
  rowwise() %>%
  mutate(Total = sum(c_across(-Exp_SD)))

bind_rows(observed, expected) %>% kable()
Obs_SD (-inf,-3) [-3,-2) [-2,-1) [-1,0) (0,1] (1,2] (2,3] (3,inf) Total Exp_SD
BODYFAT 1.00 5.00 34.00 83.00 89.00 34.00 5.00 1.00 252 NA
HEIGHT 1.00 4.00 28.00 84.00 95.00 34.00 5.00 1.00 252 NA
WEIGHT 1.00 4.00 34.00 87.00 85.00 35.00 5.00 1.00 252 NA
NA 0.34 5.39 34.25 86.02 86.02 34.25 5.39 0.34 252 Theoretical

Version 2

expected <- # mu = 0, sigma = 1
  c(pnorm(-3), # left tail
    pnorm(-2) - pnorm(-3), # -3 sd
    pnorm(-1) - pnorm(-2), # -2 sd
    pnorm(0) - pnorm(-1), # -1 sd
    pnorm(1) - pnorm(0), # +1 sd
    pnorm(2) - pnorm(1), # +2 sd
    pnorm(3) - pnorm(2), # +3 sd
    pnorm(3, lower.tail = FALSE) # right tail
    )

observed.fun <- function(df, caption){
  xbar <- mean(df)
  s <- sd(df) # not sample sd
  
  breaks.df <-
    c(min(df),
      xbar - 3 * s,
      xbar - 2 * s,
      xbar - 1 * s,
      xbar,
      xbar + 1 * s,
      xbar + 2 * s,
      xbar + 3 * s,
      max(df)
      )
  
  counts <- hist(df, plot = FALSE, breaks = breaks.df)$count
  
  hist(df,
       labels = TRUE,
       breaks = breaks.df,
       freq = TRUE,
       main = caption,
       ylim = c(0, max(counts) + 10))
  
  return(counts)
  
}

tibble(Observed = observed.fun(body.data$BODYFAT, "Body Fat"),
       Expected = expected * n)

## # A tibble: 8 x 2
##   Observed Expected
##      <int>    <dbl>
## 1        1    0.340
## 2        1    5.39 
## 3       44   34.2  
## 4       77   86.0  
## 5       89   86.0  
## 6       36   34.2  
## 7        3    5.39 
## 8        1    0.340
tibble(Observed = observed.fun(body.data$WEIGHT, "Weight"),
       Expected = expected * n)

## # A tibble: 8 x 2
##   Observed Expected
##      <int>    <dbl>
## 1        1    0.340
## 2        0    5.39 
## 3       31   34.2  
## 4      107   86.0  
## 5       75   86.0  
## 6       32   34.2  
## 7        5    5.39 
## 8        1    0.340
tibble(Observed = observed.fun(body.data$HEIGHT, "Height"),
       Expected = expected * n)

## # A tibble: 8 x 2
##   Observed Expected
##      <int>    <dbl>
## 1        1    0.340
## 2        0    5.39 
## 3       15   34.2  
## 4      111   86.0  
## 5      101   86.0  
## 6       22   34.2  
## 7        2    5.39 
## 8        0    0.340