The allocated stratified sample for houses, apartments, and condos would be 504, 324, and 72 respectively.
Nh <- c(35000, 45000, 10000)
Rh <- c(2,1,1)
NhRh <- Nh*Rh
nh <- NhRh*900/(sum(NhRh))
nh
## [1] 504 324 72
\(\frac{V_{str}}{V_{srs}}= \frac{0.0002224456}{0.000232457} = 0.956932\)
The variance of stratified random sampling is smaller than that of simple random sampling which implies that stratified random sampling is different.
#Variance of Stratified Sample with Proportional Allocation
nh_Nh <- nh/Nh
1-nh_Nh
## [1] 0.9856 0.9928 0.9928
Nh_N <- Nh/90000
Nh_N^2
## [1] 0.15123457 0.25000000 0.01234568
pop_percent<-c(0.45,0.25,0.03)
(pop_percent*(1-pop_percent))/(nh-1)
## [1] 0.0004920477 0.0005804954 0.0004098592
v_str<- sum((1-nh_Nh)*(Nh_N^2)*(pop_percent*(1-pop_percent))/(nh-1))
v_str
## [1] 0.0002224456
#Variance of SRS
p <- sum(Nh*pop_percent)/90000
p #0.298333
## [1] 0.3033333
v_srs<-((90000-900)/(90000-1))*((p*(1-p))/900)
v_srs
## [1] 0.000232457
v_str/v_srs
## [1] 0.956932
An optimal allocation for a stratified sample of size 300 would be as follows: Northeast = 7 North Central = 69 South = 122 West = 102
library(SDaA)
n <- nrow(agsrs)
S2h <- c(7647472708, 29618183543, 53587487856, 396185950266)
sum(S2h)
## [1] 487039094373
Nh <- c(220,1054,1382,422)
sum(Nh)
## [1] 3078
NhSh <- sqrt(S2h)*Nh
sum(NhSh)
## [1] 786171197
(NhSh/sum(NhSh))*300 #Round to whole numbers = 7+69+122+102 = 300
## [1] 7.341516 69.218781 122.079817 101.359886
The estimate of total number of acres devoted to farming in the US is 939069579.
The standard error of this estimate is 51,870,217.
The standard error from the previous example is 50,417,248. I believe the standard error for optical allocation is higher in this case because we sampled NorthEast with a sample of size 7.
library(SDaA)
data("agpop")
N <- nrow(agpop)
head(agpop$region == "NE")
## [1] FALSE FALSE FALSE FALSE FALSE FALSE
NE_Y <- subset(agpop, select = acres92, subset = region == "NE", drop = TRUE )
NC_Y <- subset(agpop, select = acres92, subset = region == "NC", drop = TRUE )
S_Y <- subset(agpop, select = acres92, subset = region == "S", drop = TRUE)
W_Y <- subset(agpop, select = acres92, subset = region == "W", drop = T )
set.seed(1234)
NE_samp <- sample(NE_Y,7,FALSE)
set.seed(1234)
NC_samp <- sample(NC_Y,69,FALSE)
set.seed(1234)
S_samp <- sample(S_Y,122,FALSE)
set.seed(1234)
W_samp <- sample(W_Y, 102, FALSE)
y_bar_NE <- mean(NE_samp)
y_bar_NC <- mean(NC_samp)
y_bar_S <-mean(S_samp)
y_bar_W <-mean(W_samp)
var(NE_samp)
## [1] 4491620993
var(NC_samp)
## [1] 8.4444e+10
var(S_samp)
## [1] 29683315909
var(W_samp)
## [1] 7.29812e+11
Nh <- c(220,1054,1382,422)
nh <- c(7,69,122,102)
y_bars <- c(y_bar_NE,y_bar_NC,y_bar_S, y_bar_S, y_bar_W)
#Total Number of Acres devoted to farming
sum(c(Nh[1]*y_bar_NE, Nh[2]*y_bar_NC, Nh[3]*y_bar_S, Nh[4]*y_bar_W))
## [1] 941290382
#FPC
(1-(nh/Nh))
## [1] 0.9681818 0.9345351 0.9117221 0.7582938
Var_t <- sum((1-(nh[1]/Nh[1]))*(Nh[1]^2)*var(NE_samp)/nh[1], (1-(nh[2]/Nh[2]))*(Nh[2]^2)*var(NC_samp)/nh[2],
(1-(nh[3]/Nh[3]))*(Nh[3]^2)*var(S_samp)/nh[3],
(1-(nh[4]/Nh[4]))*(Nh[4]^2)*var(W_samp)/nh[4])
sqrt(Var_t)
## [1] 51870217
sqrt(13403701)
## [1] 3661.107
The number of respondents in each strata is shown in the response_count column.
data("winter")
Nh <- c(1374, 1960, 252, 95)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
w_t <- group_by(winter,class) %>% #Looking at entries of a specific class
summarise(response_count = sum(is.na(breakaga) != 1),
yes_count = sum(breakaga == 'yes', na.rm = 1),
no_count = sum(breakaga == 'no', na.rm = 1),
nonresponse = sum(is.na(breakaga) != 0),
yes_ratio = yes_count / response_count)
w_t['total'] <- Nh
w_t
## # A tibble: 4 x 7
## class response_count yes_count no_count nonresponse yes_ratio total
## <fct> <int> <int> <int> <int> <dbl> <dbl>
## 1 faculty 232 167 65 56 0.720 1374
## 2 classstaff 514 459 55 19 0.893 1960
## 3 admstaff 86 75 11 5 0.872 252
## 4 acprof 67 58 9 6 0.866 95
In part A, we calculated the proportion of faculty and staff that would answer ‘yes’ to the question and it can be found under the “yes_ratio” column.
The estimated proportion of faculty and staff that would answer “yes” to the question is 82.6%.
The standard error is 1.2%.
ratio_est <- sum((Nh / sum(Nh)) * w_t$yes_ratio)
ratio_est
## [1] 0.8262216
se_ratio <- ((1-(w_t$response_count/w_t$total))) * (((w_t$total/sum(w_t$total))^2)) * ((w_t$yes_ratio)*(1-w_t$yes_ratio)/(w_t$response_count-1))
sqrt(sum(se_ratio))
## [1] 0.01201918
The required estimate is 0.1737784
w_t['no_ratio'] <- 1 - w_t$yes_ratio
round(w_t$yes_ratio * w_t$total)
## [1] 989 1750 220 82
round(w_t$no_ratio * w_t$total)
## [1] 385 210 32 13
w_t['sampling_weights'] <-w_t$total/w_t$response_count
w_t
## # A tibble: 4 x 9
## class response_count yes_count no_count nonresponse yes_ratio total
## <fct> <int> <int> <int> <int> <dbl> <dbl>
## 1 facu… 232 167 65 56 0.720 1374
## 2 clas… 514 459 55 19 0.893 1960
## 3 adms… 86 75 11 5 0.872 252
## 4 acpr… 67 58 9 6 0.866 95
## # ... with 2 more variables: no_ratio <dbl>, sampling_weights <dbl>
sum(w_t$sampling_weights)
## [1] 14.08379
ratio_est_no <- sum((Nh / sum(Nh)) * w_t$no_ratio)
ratio_est_no
## [1] 0.1737784
The estimate of the proportion is 0.0120. The reason this is the same as part b as its just another algebraic expression representing the same response.
w_t['s2_h'] <- (w_t$response_count / (w_t$response_count - 1)) * w_t$yes_ratio * w_t$no_ratio
ratio_se_2 <- sqrt(sum(
(1 - (w_t$response_count / w_t$total)) * ((w_t$total / sum(w_t$total)) ** 2) *
(w_t$s2_h / w_t$response_count)))
ratio_se_2
## [1] 0.01201918
The stratum with the lowest response rate for this question is Faculty. The stratification assumes nonrespondeents have similar opinions as the individuals who are responding.
q_mailed <- c(500, 653, 74, 95)
(w_t$response_count/q_mailed)*100
## [1] 46.40000 78.71363 116.21622 70.52632
w_t
## # A tibble: 4 x 10
## class response_count yes_count no_count nonresponse yes_ratio total
## <fct> <int> <int> <int> <int> <dbl> <dbl>
## 1 facu… 232 167 65 56 0.720 1374
## 2 clas… 514 459 55 19 0.893 1960
## 3 adms… 86 75 11 5 0.872 252
## 4 acpr… 67 58 9 6 0.866 95
## # ... with 3 more variables: no_ratio <dbl>, sampling_weights <dbl>,
## # s2_h <dbl>
We would use Ratio Estimation and compare the ratio between the time spent num - broad cast time of sports denom - total broadcast time ### Part B. We would use Ratio Estimation num - total number of fish denom - total number of hours they were fishing ### Part C. Ask students how much they spent on books and divde by number of students. We would use Ratio Estimation. num - cost per book denom - number of books -Swarnali suggested using regression estimation where “x” would be number of courses. ### Part D. Take a sample of chickens, weight them all. Weigh the usable meat. Goal is to estimate the total weight. We would use Ratio Estimation num - Usable meat denom - Non-usable meat
t_x <- c(13,7,11,12,4,3,11,3,5)
sum(t_x)
## [1] 69
t_x_bar <- sum(t_x)/9
t_x_bar
## [1] 7.666667
t_y <- c(10,7,13,17,8,1,15,7,4)
sum(t_y)
## [1] 82
t_y_bar <- sum(t_y)/9
t_y_bar
## [1] 9.111111
xi_xbar <- (t_x - t_x_bar)^2
sx <- sqrt((1/8)*sum(xi_xbar))
sx
## [1] 4.092676
yi_ybar <- (t_y - t_y_bar)^2
sy <- sqrt((1/8)*sum(yi_ybar))
sy
## [1] 5.182771
xiyi <- sum((t_x - t_x_bar)*(t_y - t_y_bar))
R<-xiyi/(8*4.0927*5.1828)
R
## [1] 0.8151969
B<-sum(t_y)/sum(t_x)
B
## [1] 1.188406
x <- c(13, 7, 11, 12, 4, 3, 11, 3, 5)
y <- c(10, 7, 13, 17, 8, 1, 15, 7, 4)
t_x <- sum(x)
t_y <- sum(y)
s_x <- sd(x)
s_y <- sd(y)
B <- t_y / t_x
R <- 0
for (i in 1:length(x)) {
R <- R + ((x[i] - (t_x / length(x))) * (y[i] - (t_y / length(y))) / ((length(x) - 1) * s_x * s_y))
}
test <- c()
library(combinat)
##
## Attaching package: 'combinat'
## The following object is masked from 'package:utils':
##
## combn
combinations <- t(combn(c(1:9), 3))
for (i in 1:dim(combinations)[1]) {
test$N_ybar[i] <- 9 * mean(y[combinations[i,]])
test$B_hat[i] <- (mean(y[combinations[i,]]) / mean(x[combinations[i,]]))
test$t_y_ratio[i] <- t_x * test$B_hat[i]
test$x_bar[i] <- mean(x[combinations[i,]])
}
R
## [1] 0.8152062
test
## $N_ybar
## [1] 90 102 75 54 96 72 63 120 93 72 114 90 81 105 84 126 102
## [18] 93 57 99 75 66 78 54 45 96 87 63 111 84 63 105 81 72
## [35] 96 75 117 93 84 48 90 66 57 69 45 36 87 78 54 114 93
## [52] 135 111 102 66 108 84 75 87 63 54 105 96 72 78 120 96 87
## [69] 99 75 66 117 108 84 72 48 39 90 81 57 69 60 36 78
##
## $B_hat
## [1] 0.9677419 1.0625000 1.0416667 0.7826087 1.0322581 1.0434783 0.8400000
## [8] 1.1111111 1.1071429 0.8888889 1.0857143 1.1111111 0.9310345 1.2068966
## [15] 1.0000000 1.1666667 1.2142857 1.0333333 0.9500000 1.1785714 1.2500000
## [22] 1.0000000 0.9629630 0.9473684 0.7142857 1.1851852 1.0000000 1.0000000
## [29] 1.2333333 1.2727273 1.0000000 1.2068966 1.2857143 1.0434783 1.3913043
## [36] 1.1363636 1.3000000 1.4090909 1.1666667 1.1428571 1.3636364 1.5714286
## [43] 1.1875000 1.0952381 1.1538462 0.8000000 1.3809524 1.1304348 1.2000000
## [50] 1.4074074 1.1923077 1.3235294 1.4230769 1.2142857 1.2222222 1.3846154
## [57] 1.5555556 1.2500000 1.1600000 1.2352941 0.9473684 1.4000000 1.1851852
## [64] 1.2631579 1.3684211 1.4814815 1.6842105 1.3809524 1.2692308 1.3888889
## [71] 1.1000000 1.5000000 1.2857143 1.4000000 1.3333333 1.6000000 1.0833333
## [78] 1.6666667 1.3500000 1.5833333 1.3529412 1.0526316 1.0909091 1.3684211
##
## $t_y_ratio
## [1] 66.77419 73.31250 71.87500 54.00000 71.22581 72.00000 57.96000
## [8] 76.66667 76.39286 61.33333 74.91429 76.66667 64.24138 83.27586
## [15] 69.00000 80.50000 83.78571 71.30000 65.55000 81.32143 86.25000
## [22] 69.00000 66.44444 65.36842 49.28571 81.77778 69.00000 69.00000
## [29] 85.10000 87.81818 69.00000 83.27586 88.71429 72.00000 96.00000
## [36] 78.40909 89.70000 97.22727 80.50000 78.85714 94.09091 108.42857
## [43] 81.93750 75.57143 79.61538 55.20000 95.28571 78.00000 82.80000
## [50] 97.11111 82.26923 91.32353 98.19231 83.78571 84.33333 95.53846
## [57] 107.33333 86.25000 80.04000 85.23529 65.36842 96.60000 81.77778
## [64] 87.15789 94.42105 102.22222 116.21053 95.28571 87.57692 95.83333
## [71] 75.90000 103.50000 88.71429 96.60000 92.00000 110.40000 74.75000
## [78] 115.00000 93.15000 109.25000 93.35294 72.63158 75.27273 94.42105
##
## $x_bar
## [1] 10.333333 10.666667 8.000000 7.666667 10.333333 7.666667 8.333333
## [8] 12.000000 9.333333 9.000000 11.666667 9.000000 9.666667 9.666667
## [15] 9.333333 12.000000 9.333333 10.000000 6.666667 9.333333 6.666667
## [22] 7.333333 9.000000 6.333333 7.000000 9.000000 9.666667 7.000000
## [29] 10.000000 7.333333 7.000000 9.666667 7.000000 7.666667 7.666667
## [36] 7.333333 10.000000 7.333333 8.000000 4.666667 7.333333 4.666667
## [43] 5.333333 7.000000 4.333333 5.000000 7.000000 7.666667 5.000000
## [50] 9.000000 8.666667 11.333333 8.666667 9.333333 6.000000 8.666667
## [57] 6.000000 6.666667 8.333333 5.666667 6.333333 8.333333 9.000000
## [64] 6.333333 6.333333 9.000000 6.333333 7.000000 8.666667 6.000000
## [71] 6.666667 8.666667 9.333333 6.666667 6.000000 3.333333 4.000000
## [78] 6.000000 6.666667 4.000000 5.666667 6.333333 3.666667 6.333333
The histograms are fairly similar but the t_y_ratio is a bit more spread.
hist(test$t_y_ratio)
hist(test$N_ybar)
The mean and variance of t_y_ratio are 82.79241 and 14.03418.
The mean and variance of N_ybar are 82 and 22.1207
The bias of t_y_ratio is 0.089106 Using the
mean(test$t_y_ratio)
## [1] 82.79241
sd(test$t_y_ratio)
## [1] 14.03418
mean(test$N_ybar)
## [1] 82
sd(test$N_ybar)
## [1] 22.1207
-cov(test$B_hat, test$x_bar)
## [1] 0.08910597
The approximate bias of t_y_ratio is 0.07577157. The difference between the true bias and the approximate bias is 0.0133344.
bias_t_y_r <-(1-3/9)*(1/(3*7.6667))*(B*s_x^2 - R*s_x*s_y)
0.08910597- 0.07577157
## [1] 0.0133344
require(stats)
require(graphics)
trees
## Girth Height Volume
## 1 8.3 70 10.3
## 2 8.6 65 10.3
## 3 8.8 63 10.2
## 4 10.5 72 16.4
## 5 10.7 81 18.8
## 6 10.8 83 19.7
## 7 11.0 66 15.6
## 8 11.0 75 18.2
## 9 11.1 80 22.6
## 10 11.2 75 19.9
## 11 11.3 79 24.2
## 12 11.4 76 21.0
## 13 11.4 76 21.4
## 14 11.7 69 21.3
## 15 12.0 75 19.1
## 16 12.9 74 22.2
## 17 12.9 85 33.8
## 18 13.3 86 27.4
## 19 13.7 71 25.7
## 20 13.8 64 24.9
## 21 14.0 78 34.5
## 22 14.2 80 31.7
## 23 14.5 74 36.3
## 24 16.0 72 38.3
## 25 16.3 77 42.6
## 26 17.3 81 55.4
## 27 17.5 82 55.7
## 28 17.9 80 58.3
## 29 18.0 80 51.5
## 30 18.0 80 51.0
## 31 20.6 87 77.0
plot(trees$Girth, trees$Volume, ylab="Volume", xlab="Diameter", main =
"Cherry Trees")
The confidence interval is from (84548.15, 105996.2).
N = 2967
xbar <- mean(trees$Girth)
ybar <- mean(trees$Volume)
b <- ybar/xbar
b
## [1] 2.277331
t_y_ratio <- b*41835
t_y_ratio
## [1] 95272.16
trees
## Girth Height Volume
## 1 8.3 70 10.3
## 2 8.6 65 10.3
## 3 8.8 63 10.2
## 4 10.5 72 16.4
## 5 10.7 81 18.8
## 6 10.8 83 19.7
## 7 11.0 66 15.6
## 8 11.0 75 18.2
## 9 11.1 80 22.6
## 10 11.2 75 19.9
## 11 11.3 79 24.2
## 12 11.4 76 21.0
## 13 11.4 76 21.4
## 14 11.7 69 21.3
## 15 12.0 75 19.1
## 16 12.9 74 22.2
## 17 12.9 85 33.8
## 18 13.3 86 27.4
## 19 13.7 71 25.7
## 20 13.8 64 24.9
## 21 14.0 78 34.5
## 22 14.2 80 31.7
## 23 14.5 74 36.3
## 24 16.0 72 38.3
## 25 16.3 77 42.6
## 26 17.3 81 55.4
## 27 17.5 82 55.7
## 28 17.9 80 58.3
## 29 18.0 80 51.5
## 30 18.0 80 51.0
## 31 20.6 87 77.0
se_sqr <- 1/30*sum(((trees$Volume-(b*trees$Girth))**2))
se <- sqrt(se_sqr)
se
## [1] 9.698086
fpc <- (1-31/2967)
SE_t_y_ratio <- fpc*((41835/xbar)^2)*(se_sqr/31)
s <- sqrt(SE_t_y_ratio)
t_y_ratio + (1.96 * s)
## [1] 105996.2
t_y_ratio - (1.96 * s)
## [1] 84548.15