Histogram below; data is right skewed
library(SDaA)
n <- nrow(counties)
head(counties)
## rn state county landarea totpop physicia enroll percpub
## 1 27 AL Escambia 948 36023 24 6931 95.4
## 2 48 AL Marshall 567 73524 44 11928 98.6
## 3 85 AK Prince of Wales 7325 6408 7 1317 98.6
## 4 126 AR Cross 616 19261 11 4066 99.1
## 5 158 AR Newton 823 7649 3 1579 99.2
## 6 186 CA Butte 1640 188377 327 27899 94.5
## civlabor unemp farmpop numfarm farmacre fedgrant fedciv milit veterans
## 1 15247 1339 531 414 90646 122.3 85 370 3723
## 2 38803 3189 1592 1582 136599 235.7 316 748 8510
## 3 2787 383 71 2 214 32.2 126 63 809
## 4 8336 704 762 492 339830 81.4 87 107 1505
## 5 3280 270 600 562 98106 31.7 71 44 807
## 6 77500 7303 2818 2030 494530 688.0 570 577 23958
## percviet
## 1 27.1
## 2 29.1
## 3 44.6
## 4 23.9
## 5 25.5
## 6 27.6
#Part A
hist(counties$physicia, main="Histogram of Number of Physicians for the 100 Counties", xlab="Physicians")
\(N_\bar{y}\) = 933411 which is an estimate of the total number of physicians in the US. The standard error is 491982.8
y_bar <- mean(counties$physicia)
N <- 3141
N*y_bar
## [1] 933411
fpc <- 1-(100/N)
s_sqr <- var(counties$physicia)
N*sqrt(fpc*(s_sqr/100))
## [1] 491982.8
I would argue that ratio estimation would be better to use here as the a line through the points would generally go through the origin.
plot(counties$physicia, counties$totpop, xlab="Number of Physicians", ylab="Total Pop for each County", pch=16)
ybar <- mean(counties$physicia)
xbar <- mean(counties$totpop)
B_hat <- ybar/xbar
plot(counties$physicia, counties$totpop, pch = 16)
abline(a=0,b=B_hat, col = "red", lwd = 3)
#Should we transform data? I emailed you and asked and didn't respond so I left it as is.
fit <- lm(counties$physicia ~ log(counties$totpop))
plot(fit)
#Ratio Estimation
B_hat
## [1] 0.002507104
tx <- 255077536
t_hat_yr <- B_hat*tx
t_hat_yr #total number of physicians
## [1] 639506
se_inner <- sum((counties$physicia - B_hat*counties$totpop)^2)
se_sqr <- 1/(99)*se_inner
se <- sqrt(se_sqr)
se #Standard error of the ratio estimator
## [1] 415.0517
#Standard error of ratio estimator of the total population
sqrt(fpc)*(tx/mean(counties$totpop))*(se/10)
## [1] 87885.27
#Regression Estimation
x_bar_u <- tx/N
fit <- lm(counties$physicia ~ counties$totpop)
summary(fit)
##
## Call:
## lm(formula = counties$physicia ~ counties$totpop)
##
## Residuals:
## Min 1Q Median 3Q Max
## -767.24 -43.44 4.84 32.93 3105.60
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.423e+01 3.490e+01 -1.554 0.123
## counties$totpop 2.965e-03 6.519e-05 45.477 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 340.3 on 98 degrees of freedom
## Multiple R-squared: 0.9548, Adjusted R-squared: 0.9543
## F-statistic: 2068 on 1 and 98 DF, p-value: < 2.2e-16
b_0 <- -54.23
b_1 <- 0.00296
y_bar_hat_reg <- b_0 + b_1*x_bar_u
y_bar_hat_reg
## [1] 186.1487
y_hat_reg <- y_bar_hat_reg*N
y_hat_reg
## [1] 584693.1
sqrt((fpc)*(1/100)*(1154644.1)*(1-0.955)) #SE y bar hat reg
## [1] 22.42872
se_y_reg <- N*33.316
se_y_reg #SE of y hat reg
## [1] 104645.6
From the above calculations using Ratio and Regression estimation. Both methods lead to small standard errors, but ratio estimation has a smaller standard error. However, regression estimation is closer to the true value of physicians which is 532,638.
The population mean \(\bar{y}_{post}\) = 295560.8
The variance = 267953512 and the SE = 16369.29 The 95% confidence interval is (263477,327644.6). This method provides a narrower confidence interval compared to the CI from SRS which is (260706,335088). I would argue that this would poststratification would be the better sampling method to use.
library(SDaA)
data("agpop")
N <- nrow(agpop)
n <- 300
Nh <- c(220, 1054, 1382, 422)
nh <- c(21,103,135,41)
y_bar_nh <- c(97629.8, 300504.2, 211315, 662295.5)
var_nh <- c(7647472708,29618183543,53587487856,396185950266)
#Population Mean
y_bar_post <- (Nh[1]/N)*(y_bar_nh[1]) + (Nh[2]/N)*(y_bar_nh[2]) + (Nh[3]/N)*(y_bar_nh[3]) + (Nh[4]/N)*(y_bar_nh[4])
y_bar_post
## [1] 295560.8
#Variance
fpc <- (1-n/N)
var_ybar_post <- fpc*((Nh[1]/N)*(var_nh[1])/n + (Nh[2]/N)*(var_nh[2])/n + (Nh[3]/N)*(var_nh[3])/n + (Nh[4]/N)*(var_nh[4])/n)
var_ybar_post
## [1] 267953512
se_ybar_post <- sqrt(var_ybar_post)
se_ybar_post
## [1] 16369.29
#Confidence Interval
CI_lower <- y_bar_post - 1.96*se_ybar_post
CI_lower
## [1] 263477
CI_upper <- y_bar_post + 1.96*se_ybar_post
CI_upper
## [1] 327644.6
Our estimate of total words in the dictionary is 16443.63 with a SE of 705.0922. Our estimate of total words Kajal knows is 14953.2 with a SE of 663.4724.
The percentage of the words I know 90.93611%.
#Total Number of pages = 733 Total Pages
#Easier English -Intermediate Dictionary Second Edition
set.seed(1234)
page_num <- sample(1:733, 30, replace=F)
sort(page_num)
## [1] 7 29 33 84 114 134 156 166 169 192 204 206 211 216 226 372 373
## [18] 394 446 456 467 483 502 574 587 602 628 646 665 732
#Data Collection - THIS TOOK FOREVER
x <- c(15, 30, 26, 24, 27, 22, 19, 21, 23, 24, 22, 27, 21, 26, 19, 21, 19, 25, 19, 24, 25, 34, 4, 18, 24, 21, 17, 21, 26, 29)
y <- c(12, 27, 26, 22, 26, 20, 18, 18, 17, 23, 17, 22, 19, 22, 18, 20, 17, 24, 19, 24, 22, 29, 3, 18, 21, 20, 16, 21, 23, 28)
N <- 733
n <- 30
x_bar <- sum(x)/n
x_bar
## [1] 22.43333
y_bar <- sum(y)/n
y_bar
## [1] 20.4
#Estimate of total number of words on the pages
N*x_bar
## [1] 16443.63
#Estimate of total number of words that are known by Kajal
N*y_bar
## [1] 14953.2
#Standard Error for the total number of words that are on the pages
fpc <- 1-(30/733)
N*sqrt(fpc*(var(x)/30))
## [1] 705.0922
#Standard Error for the total number of words Kajal knows from pages
N*sqrt(fpc*(var(y)/30))
## [1] 663.4724
#Estimate of percentage of known words
(N*y_bar)/(N*x_bar)
## [1] 0.9093611
No, these estimates are not valid. Individuals in the same household or residence will most likely think alike. It is probably better to cluser based on households than to perform another sampling method.
We are attempting to find the percentage of children that have access to guns in their houses.
This is a cluster sample because there are two layers of sampling that are occuring. The first layer, or the PSU (Primary Sampling Unit) is the selected clinic we are sampling. The second layer, or the SSU (Secondary Sampling Unit) is the parents. This is one stage sampling, because you are sampling PSU and then treating the SSUs like a census.
We would estimate the percentage of households with guns by using the formula
\(\hat{p}\) = \(\sum\frac{m_i\hat{p_c}}{m_o}\)
Here, \(m_o\) = \(\sum m_i\) where \(m_i\) is the number of secondary sampling units in the sample from PSU i. The standard error formula can be found in the book.
The sampling population for this study is the number of households. This sampling method is not a representative sample of households with children parents who have more sick kids may be neglectful parents and may also have guns. That is a strong assumption to make, but it could affect the “representativeness” of the sample.
The estimated total number of worm fragments is \(\hat{t}_{unb}\) = 50653.33.
\(s^2_t\) = 2611.879
The variance of the unbiased estimator is 72145966 and the standard error is 8493.878.
The 95% confidence interval for the estimated total number of worm fragments is (34005.33,67301.33).
\(\hat{V}_{WR}\) = 73219669 is the estimator of variane for a cluster sample with replacement.
When we compare the variances, we see that the ratio is 0.9853359, suggesting they are very similar. The 95% CI for \(\hat{V}_{WR}\) is (33881.91, 67424.76) which is similar to the 95% confidence interval for \(\hat{t}_{unb}\).
N <- 580
n <- 12
M <- 24
m <- 3
can_o_worms <- c(1, 4, 0, 3, 4, 0, 5, 3, 7, 3, 4, 0, 5, 2, 1, 6, 9, 7, 5, 0, 3, 1, 7, 0, 7, 4, 2, 6, 8, 3, 1, 2, 5, 4, 9, 0)
worm_matrix <- matrix(can_o_worms, nrow = 3, byrow = 1)
worm_matrix
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## [1,] 1 4 0 3 4 0 5 3 7 3 4 0
## [2,] 5 2 1 6 9 7 5 0 3 1 7 0
## [3,] 7 4 2 6 8 3 1 2 5 4 9 0
#Total Number of Worm Fragments
w_ij <- (N * M) / (n * m)
t_hat_unb <- w_ij * sum(can_o_worms)
t_hat_unb
## [1] 50653.33
#Two Stage Sampling Variance
t_i <- M * colMeans(worm_matrix)
s2_t <- 1 / (n - 1) * sum((t_i - t_hat_unb / N) ^ 2)
s2_i <- apply(worm_matrix, 2, var)
v_hat_t_hat_unb <- (N^2) * (1 - n / N) * (s2_t / n) + (N / n) * sum((1 - m / M) * (M^2) * (s2_i / m))
v_hat_t_hat_unb
## [1] 72145966
sqrt(v_hat_t_hat_unb)
## [1] 8493.878
#Confidence Interval 95%
t_hat_unb + 1.96*sqrt(v_hat_t_hat_unb)
## [1] 67301.33
t_hat_unb - 1.96*sqrt(v_hat_t_hat_unb)
## [1] 34005.33
#V_WR From Book
v_WR <- (N^2) * (s2_t / n)
v_WR
## [1] 73219669
#Comparison
#95% CI V WR
t_hat_unb + 1.96*sqrt(v_WR)
## [1] 67424.76
t_hat_unb - 1.96*sqrt(v_WR)
## [1] 33881.91
#Comparision of SAS Vwr and Vt_hat_unb
v_hat_t_hat_unb/v_WR
## [1] 0.9853359