Q1:

  1. Ratio estimation: I would measure the time devoted to sports in a sample of news broadcasts and use the ratio of sports time to total time to estimate the proportion across all broadcasts.

  2. Regression estimation: I would use this by relating the number of fish caught to variables such as the angler’s type of bait used to predict the average catch per hour.

  3. Ratio estimation: I would do this by dividing the total textbook spending by the total number of students.

  4. Ratio estimation: I would measure the usable meat weight of a sample of chickens and calculate the ratio of usable to total weight.

Q3

library(ggplot2)
library(dplyr)
data <- data.frame(
  Diameter = c(12.0, 11.4, 7.9, 9.0, 10.5, 7.9, 7.3, 10.2, 11.7, 11.3,5.7, 8.0, 10.3, 12.0, 9.2, 8.5, 7.0, 10.7, 9.3, 8.2),
  Age = c(125, 119, 83, 85, 99, 117, 69, 133, 154, 168,61, 80, 114, 147, 122, 106, 82, 88, 97, 99)
)

#age vsdiameter 
ggplot(data, aes(x = Diameter, y = Age)) +
  geom_point() +
  labs(title = "Tree Age vs. Diameter",
       x = "Diameter in cm",
       y = "Age in years")

x_bar <- mean(data$Diameter)
y_bar <- mean(data$Age)

#polation mean
X_pop <- 10.3

#ratio estimation of the mean age
R_hat <- y_bar / x_bar
Y_hat_rat <- R_hat * X_pop
Y_hat_rat
## [1] 117.6204
residuals <- data$Age - R_hat * data$Diameter
S_e_squared <- sum(residuals^2) / (length(data$Age) - 1)

#ratio estimation standardd error
n <- nrow(data)
SE_1 <- sqrt((1 - n/1132)*(S_e_squared/n)*(X_pop^2/x_bar^2))
SE_1
## [1] 4.354872
#regression
model <- lm(Age ~ Diameter, data = data)

beta_0 <- coef(model)[1]
beta_1 <- coef(model)[2]
Y_hat_reg <- beta_0 + beta_1 * 10.3
Y_hat_reg
## (Intercept) 
##    118.3634
#regression estimation standard error
SE_regression <- summary(model)$sigma / sqrt(n) * sqrt(1 - n / 1132)
cat("Standard Error of Regression Estimate", SE_regression, "\n")
## Standard Error of Regression Estimate 4.070774
# ratio and reg estimates in graph
ggplot(data, aes(x = Diameter, y = Age)) +
  geom_point() +
  geom_hline(yintercept = Y_hat_rat, color = "blue") +
  geom_hline(yintercept = Y_hat_reg, color = "red", linetype = "dotted") +
  labs(title = "Scatterplot of Tree Age vs. Diameter with Estimates",
       x = "Diameter (cm)",
       y = "Age (years)")

Q10

data(agsrs)
#acres92 vs. farms87
plot(agsrs$farms87, agsrs$acres92, 
     xlab = "Number of Farms in 1987", 
     ylab = "Acres Devoted to Farming in 1992", 
     main = "Relationship between Number of Farms (1987) and Acres Farmed (1992)")

farms_tot_87 <- 2087759
sample_ratio <- sum(agsrs$acres92) / sum(agsrs$farms87)

#ratio estimate
rat_est <- sample_ratio * farms_tot_87
rat_est
## [1] 960155061
tot_farms87 <- 2087759
reg_model <- lm(acres92 ~ farms87, data = agsrs)

betaa_0 <- coef(reg_model)[1]
betaa_1 <- coef(reg_model)[2]

reg_est <- (tot_farms87 * betaa_1) + (300 * betaa_0)
reg_est
##   farms87 
## 179597441

To find out which method Gives the most Precision, I have to find the Standard Error of Both Estimates:

var_y <- var(agsrs$acres92)
var_x <- var(agsrs$farms87)

se_ratio <- (960155061)*sqrt((1/300)* (var_y/var_x) * (1 - (300/2087759)))

residuals <- residuals(reg_model)
se_regression <- sqrt(sum(residuals^2) / (300 - 2))
se_total_regression <- 179597441 * se_regression

cat("Standard Error for Ratio Estimate: ", se_ratio, "\n")
## Standard Error for Ratio Estimate:  44284844226
cat("Standard Error for Reg Estimate: ", se_total_regression, "\n")
## Standard Error for Reg Estimate:  6.187402e+13

With our output, we see the ratio estimate has the lower standard error, meaning this method gives the most precision.

Q17

mean_lead <- 127
sd_lead <- 146
n_lead <- 121

se_lead <- sd_lead / sqrt(n_lead)
ci_lead_lower <- mean_lead - 1.98 * se_lead
ci_lead_upper <- mean_lead + 1.98 * se_lead

mean_copper <- 35
sd_copper <- 16
n_copper <- 121

se_copper <- sd_copper / sqrt(n_copper)
ci_copper_lower <- mean_copper - 1.98 * se_copper
ci_copper_upper <- mean_copper + 1.98 * se_copper

cat("Confidence Interval for Lead: ", "[",ci_lead_lower, ",",ci_lead_upper, "]", "\n")
## Confidence Interval for Lead:  [ 100.72 , 153.28 ]
cat("Confidence Interval for copper: ", "[",ci_copper_lower, ",",ci_copper_upper, "]")
## Confidence Interval for copper:  [ 32.12 , 37.88 ]
n_lead_total <- 82 + 31 + 8

weighted_mean_lead <- (82 * 71 + 31 * 259 + 8 * 189) / n_lead_total

var_A_lead <- (28^2) / 82
var_B_lead <- (232^2) / 31
var_C_lead <- (79^2) / 8

var_lead_strat <- var_A_lead + var_B_lead + var_C_lead

se_lead_strat <- sqrt(var_lead_strat)

ci_lead_strat_lower <- weighted_mean_lead - 1.98 * se_lead_strat
ci_lead_strat_upper <- weighted_mean_lead + 1.98 * se_lead_strat

cat("Poststratified Sample 95% CI for Lead:", ci_lead_strat_lower, "to", ci_lead_strat_upper, "\n")
## Poststratified Sample 95% CI for Lead: 27.45458 to 226.4793
n_total <- 82 + 31 + 8
mean_weighted <- (82 * 28 + 31 * 50 + 8 * 45) / n_total

var_A <- (9^2) / 82
var_B <- (18^2) / 31
var_C <- (15^2) / 8

total_variance <- (82^2 / n_total^2) * var_A + (31^2 / n_total^2) * var_B + (8^2 / n_total^2) * var_C

# se
se_stratified <- sqrt(total_variance)

df_stratified <- n_total - 1

t_value_stratified <- qt(0.975, df_stratified)

ci_lower_stratified <- mean_weighted - t_value_stratified * se_stratified
ci_upper_stratified <- mean_weighted + t_value_stratified * se_stratified

cat("Poststratified Sample 95% CI for Copper:", ci_lower_stratified, "to", ci_upper_stratified, "\n")
## Poststratified Sample 95% CI for Copper: 32.53556 to 36.98511

The poststratified CI for Lead is larger than the regular CI

The poststratified CI for Copper is narrower compared to the regular CI

Yes, i believe using stratification in future surveys would increae precision.

Q40

data(agpop)
agpop$farms92 <- factor(agpop$farms92)

n_srs <- 500
n <- 40
y_values <- numeric(n_srs)
ypost_values <- numeric(n_srs)

Q42

Case Fatality Rate: This rate focuses on the proportion of confirmed cases that result in death and is used to measure the lethality of a disease among people diagnosed with it.

Use of Ratio Estimation: We would use a ratio of deaths to confirmed cases, estimating the lethality of a disease among diagnosed individuals.

Sampling Error: In some countries, testing could have biases which could result in an underreporting or overreporting of the number of confirmed cases.

Measurement Error: There may be errors in diagnosis as some deaths may not be linked to the disease.

Selection Bias: If only hospitalized patients are included in the confirmed cases, this may exclude milder cases of the disease, inflating the CFR for the population with severe disease.

Population Mortality Rate: This rate gives an estimate of the likelihood of dying from a particular disease for the entire population.

Use of Ratio Estimation: Population Mortality Rate is a ratio of deaths to the total population, estimating the likelihood of death for the entire population.

Sampling Error: If the data on deaths is sampled, it may not represent the entire population accurately.

Measurement Error: Errors can arise from classifying certain deaths as something else.

Selection Bias: If the population data isn’t representative of certain groups or regions, it can bias the mortality rate.

Q45B

data(baseball)

total_homeruns <- sum(baseball$homerun)
total_runs <- sum(baseball$run)

baseball_ratio <- total_homeruns / total_runs
cat("Ratio:", baseball_ratio, "\n")
## Ratio: 0.2363149
set.seed(123)
ratios <- numeric(1000)

for (i in 1:1000) {
  sample_data <- baseball[sample(1:nrow(baseball), replace = TRUE), ]
  
  total_hr <- sum(sample_data$homerun)
  total_r <- sum(sample_data$run)
  ratios[i] <- total_hr / total_r
}

ci_lower <- quantile(ratios, 0.025)
ci_upper <- quantile(ratios, 0.975)
cat("Confidence Interval for Ratio: [", ci_lower, ",", ci_upper, "]\n")
## Confidence Interval for Ratio: [ 0.2233592 , 0.2489921 ]