getwd()

## [1] "/Users/isaiahmireles/Desktop"

population_dat <- read_dta("SamplingPrac/criminals.dta")
population_dat

## # A tibble: 3,000 × 3
##    height finger popsize
##     <dbl>  <dbl>   <dbl>
##  1     56    100    3000
##  2     57    103    3000
##  3     58     99    3000
##  4     58    102    3000
##  5     58    102    3000
##  6     58    103    3000
##  7     58    104    3000
##  8     58    107    3000
##  9     59    100    3000
## 10     59    101    3000
## # ℹ 2,990 more rows

In the population of 3000 criminals, what values do you obtain from Stata?

mu <- mean(population_dat$height)
mu

## [1] 65.473

?sd # divides by N-1 : Samp est
N <- length(population_dat$height)  
# True population variance : 
sigma2 <- (N-1)*(sd(population_dat$height)^2)*(1/N)
sigma2 # based on units of samp

## [1] 6.539938

\[ \hat{\sigma}^2=s^2 = \frac{1}{N - 1} \sum_{i=1}^{N} (x_i - \bar{x})^2 \]

\[ \sigma^2 = \frac{1}{N} \sum_{i=1}^{N} (x_i - \mu)^2 \]

coefficient_variation <- sqrt(sigma2)/mu # unitless
coefficient_variation

## [1] 0.03905931

SD_samp <- sd(population_dat$height)
SD_samp

## [1] 2.557757

# Notice :   
SD_samp > sigma2 # FALSE

## [1] FALSE

Estimating height of criminals from a sample. Calculate the appropriate Sample Size (n)

\[ n=\frac{N\sigma^2}{ND+\sigma^2} \text{ For : }D=(\frac{B}{Z_{\alpha/2}})^2 \]

and suppose we wish to be 95% confident (1.96 \(\approx\) 2 ) with a margin of error of 1/2 inch

so :

\[ D = \frac{B^2}{4} \text{ For : } B =0.5 \text{ inch} \]

B <- .5
Z_crit <- qnorm(0.975) # exact
Z_crit

## [1] 1.959964

D <- (B/Z_crit)^2
D

## [1] 0.06507944

# round to whole number 
n <- round((N*sigma2)/(N*D+sigma2),0)
n

## [1] 97

If you notices, i didnt use 2 to approximate but rather exact value

Mean finger length :

mu2 <- mean(population_dat$finger)
mu2

## [1] 115.4737

Simple Random Sample!!!! (WOR)

Notice we are using a finite-population and therefore should SRS-WOR :

?sample
index <- sample(
  x = seq_along(population_dat$height),
  size = n,
  replace = FALSE
)
sample <- population_dat$height[index]
hist(sample)

hist(population_dat$height)

mean from sample

sample_mean <- mean(sample)
sample_mean

## [1] 65.50515

s <- sd(sample)

Recall the standard error for a sample average :

\[ S.E._{\bar{x}}=\frac{s}{\sqrt{n}}*\sqrt{FPC} \]

FPC <- (N-n)/(N-1)
FPC

## [1] 0.9679893

FPC > 1-n/N # Notice the true is bigger

## [1] TRUE

SE_xbar <- (s/sqrt(n))* sqrt(FPC)
SE_xbar

## [1] 0.2526885

Confidence interval

ME <- SE_xbar*Z_crit
ci <- sample_mean + c(-1,1)*ME
ci

## [1] 65.00989 66.00041

# generate plt
plot(
  x = ci,
  y = c(1, 1),
  type = "n",
  xlab = "Mean",
  ylab = "",
  yaxt = "n",
  main = "Confidence Interval for SRS"
)

# Add range of confidence
segments(
  x0 = ci[1],
  x1 = ci[2],
  y0 = 1,
  y1 = 1,
  lwd = 3
)

# add pt for est
points(
  x = sample_mean,
  y = 1,
  pch = 19
)

# add line for true mu
abline(
  v = mu,
  lty = 2,
  col = "red"
)

okay, to get an idea of the relative efficiency of the estimator, we may graph the relationship of the auxiliary variable :

plot(population_dat$height, population_dat$finger)

plot(population_dat$height[index], population_dat$finger[index])

Mean finger length

df <-
  data.frame(height=population_dat$height[index], finger=population_dat$finger[index])
xbar_finger <- mean(df$finger)
xbar_finger

## [1] 116.1134

Ratio Est of \(\hat{\mu}_{\text{Height}}\)

Recall :

\[ r = \frac{\bar{y}}{\bar{x}} \]

r <- mean(df$height)/mean(df$finger)

and we say :

\[ r*\mu_x =\hat{\mu}_{y} \]

ratio_est <- mean(population_dat$finger)*r
ratio_est

## [1] 65.14425

And to est the Standard deviation of ratio estimate :

\[ \text{Var}(\hat{\mu}_y)=\mu_{x}^2 \text{Var}(r) \]

\[ \text{For : }\text{Var}(r)=\frac{s^2_r}{n} \]

\[ \text{For : }s_r^2=\frac{\sum(y_i - r*x_i)^2}{n-1} \]

or,

\[ \text{FPC}*\text{Var}(r) \]

s_r2 <- (1/(n-1))*sum((df$height-(df$finger*r))^2)
s_r2

## [1] 5.540488

var_ratio_est <- FPC*(s_r2/n)
var_ratio_est

## [1] 0.05529003

SD_ratio_est <- sqrt(var_ratio_est)
SD_ratio_est

## [1] 0.2351383

Confidence interval

ME_ratio <- Z_crit*SD_ratio_est
ci_ratio <- ratio_est + c(-1,1)*ME_ratio

# generate plot
plot(
  x = ci_ratio,
  y = c(1, 1),
  type = "n",
  xlab = "Ratio Estimate",
  ylab = "",
  yaxt = "n",
  main = "Confidence Interval for Ratio Estimator"
)

# Add CI segment
segments(
  x0 = ci_ratio[1],
  x1 = ci_ratio[2],
  y0 = 1,
  y1 = 1,
  lwd = 3
)

# Add point for ratio estimate
points(
  x = ratio_est,
  y = 1,
  pch = 19
)

# Add vertical line for true value
abline(
  v = mu,
  lty = 2,
  col = "red"
)

abs(mu - ratio_est)

## [1] 0.3287509

abs(mu - sample_mean)

## [1] 0.03215464

As we can see, our ratio est performed worse and was more off but the interval was smaller.

Regression Est of \(\hat{\mu}_{\text{Height}}\)

# Done by R
mdl <- lm(height~finger,dat=df)
mdl

## 
## Call:
## lm(formula = height ~ finger, data = df)
## 
## Coefficients:
## (Intercept)       finger  
##     30.3888       0.3024

Done by hand :

y <- df$height
x <- df$finger

ybar <- mean(y)
xbar <- mean(x)

# slope
b <- sum((y - ybar)*(x - xbar)) /
     sum((x - xbar)^2)
b

## [1] 0.3024312

# intercept
a <- ybar - b*xbar
a

## [1] 30.38884

mu_reg_est <- ybar + b*(mean(population_dat$finger)
 - xbar)
mu_reg_est

## [1] 65.31168

Estimate \(s^2_{yL}\)

MSE <- summary(mdl)$sigma^2
MSE

## [1] 3.013452

V_reg <- FPC*(MSE/n)
V_reg

## [1] 0.03007206

SE <- sqrt(V_reg)
SE

## [1] 0.173413

Confidence interval

ME <- SE*Z_crit
ci_reg <- mu_reg_est+c(-1,1)*ME
ci_reg

## [1] 64.97180 65.65156

Plot!!!

grep("^ci", ls(), value = T)

## [1] "ci"       "ci_ratio" "ci_reg"

ci_SRS <- ci
ci_SRS

## [1] 65.00989 66.00041

ci_ratio

## [1] 64.68339 65.60511

ci_reg

## [1] 64.97180 65.65156

Okay, now time to plot them all togehter

# Combine all interval endpoints to control x-axis range
x_range <- range(c(ci_SRS, ci_ratio, ci_reg, mu))

# Create empty plot
plot(
  x = x_range,
  y = c(1, 3),
  type = "n",
  xlab = "Parameter Value",
  ylab = "",
  yaxt = "n",
  main = "Comparison of Confidence Intervals"
)

# Add y-axis labels
axis(
  side = 2,
  at = c(1, 2, 3),
  labels = c("SRS", "Ratio", "Regression"),
  las = 1
)

# --- SRS CI ---
segments(
  x0 = ci_SRS[1],
  x1 = ci_SRS[2],
  y0 = 1,
  y1 = 1,
  lwd = 3
)

# --- Ratio CI ---
segments(
  x0 = ci_ratio[1],
  x1 = ci_ratio[2],
  y0 = 2,
  y1 = 2,
  lwd = 3
)

# --- Regression CI ---
segments(
  x0 = ci_reg[1],
  x1 = ci_reg[2],
  y0 = 3,
  y1 = 3,
  lwd = 3
)

# Add vertical line for true mu
abline(
  v = mu,
  lty = 2,
  col = "red",
  lwd = 2
)

HW6 C155

Isaiah C. Mireles

2026-02-19

Simple Random Sample!!!! (WOR)

mean from sample

Confidence interval

Mean finger length

Ratio Est of \(\hat{\mu}_{\text{Height}}\)

Confidence interval

Regression Est of \(\hat{\mu}_{\text{Height}}\)

Confidence interval

Plot!!!