getwd()
## [1] "/Users/isaiahmireles/Desktop"
population_dat <- read_dta("SamplingPrac/criminals.dta")
population_dat
## # A tibble: 3,000 × 3
## height finger popsize
## <dbl> <dbl> <dbl>
## 1 56 100 3000
## 2 57 103 3000
## 3 58 99 3000
## 4 58 102 3000
## 5 58 102 3000
## 6 58 103 3000
## 7 58 104 3000
## 8 58 107 3000
## 9 59 100 3000
## 10 59 101 3000
## # ℹ 2,990 more rows
mu <- mean(population_dat$height)
mu
## [1] 65.473
?sd # divides by N-1 : Samp est
N <- length(population_dat$height)
# True population variance :
sigma2 <- (N-1)*(sd(population_dat$height)^2)*(1/N)
sigma2 # based on units of samp
## [1] 6.539938
\[ \hat{\sigma}^2=s^2 = \frac{1}{N - 1} \sum_{i=1}^{N} (x_i - \bar{x})^2 \]
\[ \sigma^2 = \frac{1}{N} \sum_{i=1}^{N} (x_i - \mu)^2 \]
coefficient_variation <- sqrt(sigma2)/mu # unitless
coefficient_variation
## [1] 0.03905931
SD_samp <- sd(population_dat$height)
SD_samp
## [1] 2.557757
# Notice :
SD_samp > sigma2 # FALSE
## [1] FALSE
\[ n=\frac{N\sigma^2}{ND+\sigma^2} \text{ For : }D=(\frac{B}{Z_{\alpha/2}})^2 \]
and suppose we wish to be 95% confident (1.96 \(\approx\) 2 ) with a margin of error of 1/2 inch
so :
\[ D = \frac{B^2}{4} \text{ For : } B =0.5 \text{ inch} \]
B <- .5
Z_crit <- qnorm(0.975) # exact
Z_crit
## [1] 1.959964
D <- (B/Z_crit)^2
D
## [1] 0.06507944
# round to whole number
n <- round((N*sigma2)/(N*D+sigma2),0)
n
## [1] 97
If you notices, i didnt use 2 to approximate but rather exact value
Mean finger length :
mu2 <- mean(population_dat$finger)
mu2
## [1] 115.4737
Notice we are using a finite-population and therefore should SRS-WOR :
?sample
index <- sample(
x = seq_along(population_dat$height),
size = n,
replace = FALSE
)
sample <- population_dat$height[index]
hist(sample)
hist(population_dat$height)
sample_mean <- mean(sample)
sample_mean
## [1] 65.50515
s <- sd(sample)
Recall the standard error for a sample average :
\[ S.E._{\bar{x}}=\frac{s}{\sqrt{n}}*\sqrt{FPC} \]
FPC <- (N-n)/(N-1)
FPC
## [1] 0.9679893
FPC > 1-n/N # Notice the true is bigger
## [1] TRUE
SE_xbar <- (s/sqrt(n))* sqrt(FPC)
SE_xbar
## [1] 0.2526885
ME <- SE_xbar*Z_crit
ci <- sample_mean + c(-1,1)*ME
ci
## [1] 65.00989 66.00041
# generate plt
plot(
x = ci,
y = c(1, 1),
type = "n",
xlab = "Mean",
ylab = "",
yaxt = "n",
main = "Confidence Interval for SRS"
)
# Add range of confidence
segments(
x0 = ci[1],
x1 = ci[2],
y0 = 1,
y1 = 1,
lwd = 3
)
# add pt for est
points(
x = sample_mean,
y = 1,
pch = 19
)
# add line for true mu
abline(
v = mu,
lty = 2,
col = "red"
)
okay, to get an idea of the relative efficiency of the estimator, we may graph the relationship of the auxiliary variable :
plot(population_dat$height, population_dat$finger)
plot(population_dat$height[index], population_dat$finger[index])
df <-
data.frame(height=population_dat$height[index], finger=population_dat$finger[index])
xbar_finger <- mean(df$finger)
xbar_finger
## [1] 116.1134
Recall :
\[ r = \frac{\bar{y}}{\bar{x}} \]
r <- mean(df$height)/mean(df$finger)
and we say :
\[ r*\mu_x =\hat{\mu}_{y} \]
ratio_est <- mean(population_dat$finger)*r
ratio_est
## [1] 65.14425
And to est the Standard deviation of ratio estimate :
\[ \text{Var}(\hat{\mu}_y)=\mu_{x}^2 \text{Var}(r) \]
\[ \text{For : }\text{Var}(r)=\frac{s^2_r}{n} \]
\[ \text{For : }s_r^2=\frac{\sum(y_i - r*x_i)^2}{n-1} \]
or,
\[ \text{FPC}*\text{Var}(r) \]
s_r2 <- (1/(n-1))*sum((df$height-(df$finger*r))^2)
s_r2
## [1] 5.540488
var_ratio_est <- FPC*(s_r2/n)
var_ratio_est
## [1] 0.05529003
SD_ratio_est <- sqrt(var_ratio_est)
SD_ratio_est
## [1] 0.2351383
ME_ratio <- Z_crit*SD_ratio_est
ci_ratio <- ratio_est + c(-1,1)*ME_ratio
# generate plot
plot(
x = ci_ratio,
y = c(1, 1),
type = "n",
xlab = "Ratio Estimate",
ylab = "",
yaxt = "n",
main = "Confidence Interval for Ratio Estimator"
)
# Add CI segment
segments(
x0 = ci_ratio[1],
x1 = ci_ratio[2],
y0 = 1,
y1 = 1,
lwd = 3
)
# Add point for ratio estimate
points(
x = ratio_est,
y = 1,
pch = 19
)
# Add vertical line for true value
abline(
v = mu,
lty = 2,
col = "red"
)
abs(mu - ratio_est)
## [1] 0.3287509
abs(mu - sample_mean)
## [1] 0.03215464
As we can see, our ratio est performed worse and was more off but the interval was smaller.
# Done by R
mdl <- lm(height~finger,dat=df)
mdl
##
## Call:
## lm(formula = height ~ finger, data = df)
##
## Coefficients:
## (Intercept) finger
## 30.3888 0.3024
Done by hand :
y <- df$height
x <- df$finger
ybar <- mean(y)
xbar <- mean(x)
# slope
b <- sum((y - ybar)*(x - xbar)) /
sum((x - xbar)^2)
b
## [1] 0.3024312
# intercept
a <- ybar - b*xbar
a
## [1] 30.38884
mu_reg_est <- ybar + b*(mean(population_dat$finger)
- xbar)
mu_reg_est
## [1] 65.31168
Estimate \(s^2_{yL}\)
MSE <- summary(mdl)$sigma^2
MSE
## [1] 3.013452
V_reg <- FPC*(MSE/n)
V_reg
## [1] 0.03007206
SE <- sqrt(V_reg)
SE
## [1] 0.173413
ME <- SE*Z_crit
ci_reg <- mu_reg_est+c(-1,1)*ME
ci_reg
## [1] 64.97180 65.65156
grep("^ci", ls(), value = T)
## [1] "ci" "ci_ratio" "ci_reg"
ci_SRS <- ci
ci_SRS
## [1] 65.00989 66.00041
ci_ratio
## [1] 64.68339 65.60511
ci_reg
## [1] 64.97180 65.65156
Okay, now time to plot them all togehter
# Combine all interval endpoints to control x-axis range
x_range <- range(c(ci_SRS, ci_ratio, ci_reg, mu))
# Create empty plot
plot(
x = x_range,
y = c(1, 3),
type = "n",
xlab = "Parameter Value",
ylab = "",
yaxt = "n",
main = "Comparison of Confidence Intervals"
)
# Add y-axis labels
axis(
side = 2,
at = c(1, 2, 3),
labels = c("SRS", "Ratio", "Regression"),
las = 1
)
# --- SRS CI ---
segments(
x0 = ci_SRS[1],
x1 = ci_SRS[2],
y0 = 1,
y1 = 1,
lwd = 3
)
# --- Ratio CI ---
segments(
x0 = ci_ratio[1],
x1 = ci_ratio[2],
y0 = 2,
y1 = 2,
lwd = 3
)
# --- Regression CI ---
segments(
x0 = ci_reg[1],
x1 = ci_reg[2],
y0 = 3,
y1 = 3,
lwd = 3
)
# Add vertical line for true mu
abline(
v = mu,
lty = 2,
col = "red",
lwd = 2
)