getwd()
## [1] "/Users/isaiahmireles/Desktop"
population_dat <- read_dta("SamplingPrac/criminals.dta")
population_dat
## # A tibble: 3,000 × 3
##    height finger popsize
##     <dbl>  <dbl>   <dbl>
##  1     56    100    3000
##  2     57    103    3000
##  3     58     99    3000
##  4     58    102    3000
##  5     58    102    3000
##  6     58    103    3000
##  7     58    104    3000
##  8     58    107    3000
##  9     59    100    3000
## 10     59    101    3000
## # ℹ 2,990 more rows
  1. In the population of 3000 criminals, what values do you obtain from Stata?
mu <- mean(population_dat$height)
mu
## [1] 65.473
?sd # divides by N-1 : Samp est
N <- length(population_dat$height)  
# True population variance : 
sigma2 <- (N-1)*(sd(population_dat$height)^2)*(1/N)
sigma2 # based on units of samp
## [1] 6.539938

\[ \hat{\sigma}^2=s^2 = \frac{1}{N - 1} \sum_{i=1}^{N} (x_i - \bar{x})^2 \]

\[ \sigma^2 = \frac{1}{N} \sum_{i=1}^{N} (x_i - \mu)^2 \]

coefficient_variation <- sqrt(sigma2)/mu # unitless
coefficient_variation
## [1] 0.03905931
SD_samp <- sd(population_dat$height)
SD_samp
## [1] 2.557757
# Notice :   
SD_samp > sigma2 # FALSE
## [1] FALSE
  1. Estimating height of criminals from a sample. Calculate the appropriate Sample Size (n)

\[ n=\frac{N\sigma^2}{ND+\sigma^2} \text{ For : }D=(\frac{B}{Z_{\alpha/2}})^2 \]

and suppose we wish to be 95% confident (1.96 \(\approx\) 2 ) with a margin of error of 1/2 inch

so :

\[ D = \frac{B^2}{4} \text{ For : } B =0.5 \text{ inch} \]

B <- .5
Z_crit <- qnorm(0.975) # exact
Z_crit
## [1] 1.959964
D <- (B/Z_crit)^2
D
## [1] 0.06507944
# round to whole number 
n <- round((N*sigma2)/(N*D+sigma2),0)
n
## [1] 97

If you notices, i didnt use 2 to approximate but rather exact value

Mean finger length :

mu2 <- mean(population_dat$finger)
mu2
## [1] 115.4737

Simple Random Sample!!!! (WOR)

Notice we are using a finite-population and therefore should SRS-WOR :

?sample
index <- sample(
  x = seq_along(population_dat$height),
  size = n,
  replace = FALSE
)
sample <- population_dat$height[index]
hist(sample)

hist(population_dat$height)

mean from sample

sample_mean <- mean(sample)
sample_mean
## [1] 65.50515
s <- sd(sample)

Recall the standard error for a sample average :

\[ S.E._{\bar{x}}=\frac{s}{\sqrt{n}}*\sqrt{FPC} \]

FPC <- (N-n)/(N-1)
FPC
## [1] 0.9679893
FPC > 1-n/N # Notice the true is bigger
## [1] TRUE
SE_xbar <- (s/sqrt(n))* sqrt(FPC)
SE_xbar
## [1] 0.2526885

Confidence interval

ME <- SE_xbar*Z_crit
ci <- sample_mean + c(-1,1)*ME
ci
## [1] 65.00989 66.00041
# generate plt
plot(
  x = ci,
  y = c(1, 1),
  type = "n",
  xlab = "Mean",
  ylab = "",
  yaxt = "n",
  main = "Confidence Interval for SRS"
)

# Add range of confidence
segments(
  x0 = ci[1],
  x1 = ci[2],
  y0 = 1,
  y1 = 1,
  lwd = 3
)

# add pt for est
points(
  x = sample_mean,
  y = 1,
  pch = 19
)

# add line for true mu
abline(
  v = mu,
  lty = 2,
  col = "red"
)

okay, to get an idea of the relative efficiency of the estimator, we may graph the relationship of the auxiliary variable :

plot(population_dat$height, population_dat$finger)

plot(population_dat$height[index], population_dat$finger[index])

Mean finger length

df <-
  data.frame(height=population_dat$height[index], finger=population_dat$finger[index])
xbar_finger <- mean(df$finger)
xbar_finger
## [1] 116.1134

Ratio Est of \(\hat{\mu}_{\text{Height}}\)

Recall :

\[ r = \frac{\bar{y}}{\bar{x}} \]

r <- mean(df$height)/mean(df$finger)

and we say :

\[ r*\mu_x =\hat{\mu}_{y} \]

ratio_est <- mean(population_dat$finger)*r
ratio_est
## [1] 65.14425

And to est the Standard deviation of ratio estimate :

\[ \text{Var}(\hat{\mu}_y)=\mu_{x}^2 \text{Var}(r) \]

\[ \text{For : }\text{Var}(r)=\frac{s^2_r}{n} \]

\[ \text{For : }s_r^2=\frac{\sum(y_i - r*x_i)^2}{n-1} \]

or,

\[ \text{FPC}*\text{Var}(r) \]

s_r2 <- (1/(n-1))*sum((df$height-(df$finger*r))^2)
s_r2
## [1] 5.540488
var_ratio_est <- FPC*(s_r2/n)
var_ratio_est
## [1] 0.05529003
SD_ratio_est <- sqrt(var_ratio_est)
SD_ratio_est
## [1] 0.2351383

Confidence interval

ME_ratio <- Z_crit*SD_ratio_est
ci_ratio <- ratio_est + c(-1,1)*ME_ratio
# generate plot
plot(
  x = ci_ratio,
  y = c(1, 1),
  type = "n",
  xlab = "Ratio Estimate",
  ylab = "",
  yaxt = "n",
  main = "Confidence Interval for Ratio Estimator"
)

# Add CI segment
segments(
  x0 = ci_ratio[1],
  x1 = ci_ratio[2],
  y0 = 1,
  y1 = 1,
  lwd = 3
)

# Add point for ratio estimate
points(
  x = ratio_est,
  y = 1,
  pch = 19
)

# Add vertical line for true value
abline(
  v = mu,
  lty = 2,
  col = "red"
)

abs(mu - ratio_est)
## [1] 0.3287509
abs(mu - sample_mean)
## [1] 0.03215464

As we can see, our ratio est performed worse and was more off but the interval was smaller.

Regression Est of \(\hat{\mu}_{\text{Height}}\)

# Done by R
mdl <- lm(height~finger,dat=df)
mdl
## 
## Call:
## lm(formula = height ~ finger, data = df)
## 
## Coefficients:
## (Intercept)       finger  
##     30.3888       0.3024

Done by hand :

y <- df$height
x <- df$finger

ybar <- mean(y)
xbar <- mean(x)

# slope
b <- sum((y - ybar)*(x - xbar)) /
     sum((x - xbar)^2)
b
## [1] 0.3024312
# intercept
a <- ybar - b*xbar
a
## [1] 30.38884
mu_reg_est <- ybar + b*(mean(population_dat$finger)
 - xbar)
mu_reg_est
## [1] 65.31168

Estimate \(s^2_{yL}\)

MSE <- summary(mdl)$sigma^2
MSE
## [1] 3.013452
V_reg <- FPC*(MSE/n)
V_reg
## [1] 0.03007206
SE <- sqrt(V_reg)
SE
## [1] 0.173413

Confidence interval

ME <- SE*Z_crit
ci_reg <- mu_reg_est+c(-1,1)*ME
ci_reg
## [1] 64.97180 65.65156

Plot!!!

grep("^ci", ls(), value = T)
## [1] "ci"       "ci_ratio" "ci_reg"
ci_SRS <- ci
ci_SRS
## [1] 65.00989 66.00041
ci_ratio
## [1] 64.68339 65.60511
ci_reg
## [1] 64.97180 65.65156

Okay, now time to plot them all togehter

# Combine all interval endpoints to control x-axis range
x_range <- range(c(ci_SRS, ci_ratio, ci_reg, mu))

# Create empty plot
plot(
  x = x_range,
  y = c(1, 3),
  type = "n",
  xlab = "Parameter Value",
  ylab = "",
  yaxt = "n",
  main = "Comparison of Confidence Intervals"
)

# Add y-axis labels
axis(
  side = 2,
  at = c(1, 2, 3),
  labels = c("SRS", "Ratio", "Regression"),
  las = 1
)

# --- SRS CI ---
segments(
  x0 = ci_SRS[1],
  x1 = ci_SRS[2],
  y0 = 1,
  y1 = 1,
  lwd = 3
)

# --- Ratio CI ---
segments(
  x0 = ci_ratio[1],
  x1 = ci_ratio[2],
  y0 = 2,
  y1 = 2,
  lwd = 3
)

# --- Regression CI ---
segments(
  x0 = ci_reg[1],
  x1 = ci_reg[2],
  y0 = 3,
  y1 = 3,
  lwd = 3
)

# Add vertical line for true mu
abline(
  v = mu,
  lty = 2,
  col = "red",
  lwd = 2
)