Chapter 3 Excercise 6

Part A.

The allocated stratified sample for houses, apartments, and condos would be 504, 324, and 72 respectively.

Nh <- c(35000, 45000, 10000)
Rh <- c(2,1,1)
NhRh <- Nh*Rh
nh <- NhRh*900/(sum(NhRh))
nh
## [1] 504 324  72

Part B.

\(\frac{V_{str}}{V_{srs}}= \frac{0.0002224456}{0.000232457} = 0.956932\)

The variance of stratified random sampling is smaller than that of simple random sampling which implies that stratified random sampling is different.

#Variance of Stratified Sample with Proportional Allocation 
nh_Nh <- nh/Nh
1-nh_Nh
## [1] 0.9856 0.9928 0.9928
Nh_N <- Nh/90000
Nh_N^2
## [1] 0.15123457 0.25000000 0.01234568
pop_percent<-c(0.45,0.25,0.03)
(pop_percent*(1-pop_percent))/(nh-1)
## [1] 0.0004920477 0.0005804954 0.0004098592
v_str<- sum((1-nh_Nh)*(Nh_N^2)*(pop_percent*(1-pop_percent))/(nh-1))
v_str
## [1] 0.0002224456
#Variance of SRS
p <- sum(Nh*pop_percent)/90000
p #0.298333
## [1] 0.3033333
v_srs<-((90000-900)/(90000-1))*((p*(1-p))/900)
v_srs
## [1] 0.000232457
v_str/v_srs
## [1] 0.956932

Chapter 3 Excercise 12

An optimal allocation for a stratified sample of size 300 would be as follows: Northeast = 7 North Central = 69 South = 122 West = 102

library(SDaA)
n <- nrow(agsrs)

S2h <- c(7647472708, 29618183543, 53587487856, 396185950266)
sum(S2h)
## [1] 487039094373
Nh <- c(220,1054,1382,422)
sum(Nh)
## [1] 3078
NhSh <- sqrt(S2h)*Nh
sum(NhSh)
## [1] 786171197
(NhSh/sum(NhSh))*300 #Round to whole numbers = 7+69+122+102 = 300
## [1]   7.341516  69.218781 122.079817 101.359886

Chapter 3 Problem 13

The estimate of total number of acres devoted to farming in the US is 939069579.

The standard error of this estimate is 51,870,217.

The standard error from the previous example is 50,417,248. I believe the standard error for optical allocation is higher in this case because we sampled NorthEast with a sample of size 7.

library(SDaA)
data("agpop")
N <- nrow(agpop)

head(agpop$region == "NE")
## [1] FALSE FALSE FALSE FALSE FALSE FALSE
NE_Y <- subset(agpop, select = acres92, subset = region == "NE", drop = TRUE )

NC_Y <- subset(agpop, select = acres92, subset = region == "NC", drop = TRUE ) 

S_Y <- subset(agpop, select = acres92, subset = region == "S", drop = TRUE)

W_Y <- subset(agpop, select = acres92, subset = region == "W", drop = T )

set.seed(1234)
NE_samp <- sample(NE_Y,7,FALSE)
set.seed(1234)
NC_samp <- sample(NC_Y,69,FALSE)
set.seed(1234)
S_samp <- sample(S_Y,122,FALSE)
set.seed(1234)
W_samp <- sample(W_Y, 102, FALSE)

y_bar_NE <- mean(NE_samp)
y_bar_NC <- mean(NC_samp)
y_bar_S <-mean(S_samp)
y_bar_W <-mean(W_samp)

var(NE_samp)
## [1] 4491620993
var(NC_samp)
## [1] 8.4444e+10
var(S_samp)
## [1] 29683315909
var(W_samp)
## [1] 7.29812e+11
Nh <- c(220,1054,1382,422)
nh <- c(7,69,122,102)
y_bars <- c(y_bar_NE,y_bar_NC,y_bar_S, y_bar_S, y_bar_W)

#Total Number of Acres devoted to farming
sum(c(Nh[1]*y_bar_NE, Nh[2]*y_bar_NC, Nh[3]*y_bar_S, Nh[4]*y_bar_W))
## [1] 941290382
#FPC
(1-(nh/Nh))
## [1] 0.9681818 0.9345351 0.9117221 0.7582938
Var_t <- sum((1-(nh[1]/Nh[1]))*(Nh[1]^2)*var(NE_samp)/nh[1], (1-(nh[2]/Nh[2]))*(Nh[2]^2)*var(NC_samp)/nh[2],
(1-(nh[3]/Nh[3]))*(Nh[3]^2)*var(S_samp)/nh[3],
(1-(nh[4]/Nh[4]))*(Nh[4]^2)*var(W_samp)/nh[4])


sqrt(Var_t)
## [1] 51870217
sqrt(13403701)
## [1] 3661.107

Chapter 3 Problem 19 - Worked with Grady F.

Part A.

The number of respondents in each strata is shown in the response_count column.

data("winter")
Nh <- c(1374, 1960, 252, 95)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
w_t <- group_by(winter,class) %>% #Looking at entries of a specific class
      summarise(response_count = sum(is.na(breakaga) != 1),
                yes_count = sum(breakaga == 'yes', na.rm = 1),
                no_count = sum(breakaga == 'no', na.rm = 1),
                nonresponse = sum(is.na(breakaga) != 0),
                yes_ratio = yes_count / response_count)
      
w_t['total'] <- Nh
w_t
## # A tibble: 4 x 7
##   class      response_count yes_count no_count nonresponse yes_ratio total
##   <fct>               <int>     <int>    <int>       <int>     <dbl> <dbl>
## 1 faculty               232       167       65          56     0.720  1374
## 2 classstaff            514       459       55          19     0.893  1960
## 3 admstaff               86        75       11           5     0.872   252
## 4 acprof                 67        58        9           6     0.866    95

Part B.

In part A, we calculated the proportion of faculty and staff that would answer ‘yes’ to the question and it can be found under the “yes_ratio” column.

The estimated proportion of faculty and staff that would answer “yes” to the question is 82.6%.

The standard error is 1.2%.

ratio_est <- sum((Nh / sum(Nh)) * w_t$yes_ratio)
ratio_est
## [1] 0.8262216
se_ratio <- ((1-(w_t$response_count/w_t$total))) * (((w_t$total/sum(w_t$total))^2)) * ((w_t$yes_ratio)*(1-w_t$yes_ratio)/(w_t$response_count-1))

sqrt(sum(se_ratio))
## [1] 0.01201918

Part C.

The required estimate is 0.1737784

w_t['no_ratio'] <- 1 - w_t$yes_ratio
round(w_t$yes_ratio * w_t$total)
## [1]  989 1750  220   82
round(w_t$no_ratio * w_t$total)
## [1] 385 210  32  13
w_t['sampling_weights'] <-w_t$total/w_t$response_count
w_t
## # A tibble: 4 x 9
##   class response_count yes_count no_count nonresponse yes_ratio total
##   <fct>          <int>     <int>    <int>       <int>     <dbl> <dbl>
## 1 facu…            232       167       65          56     0.720  1374
## 2 clas…            514       459       55          19     0.893  1960
## 3 adms…             86        75       11           5     0.872   252
## 4 acpr…             67        58        9           6     0.866    95
## # ... with 2 more variables: no_ratio <dbl>, sampling_weights <dbl>
sum(w_t$sampling_weights)
## [1] 14.08379
ratio_est_no <- sum((Nh / sum(Nh)) * w_t$no_ratio)
ratio_est_no
## [1] 0.1737784

Part D.

The estimate of the proportion is 0.0120. The reason this is the same as part b as its just another algebraic expression representing the same response.

w_t['s2_h'] <- (w_t$response_count / (w_t$response_count - 1)) * w_t$yes_ratio * w_t$no_ratio
ratio_se_2 <- sqrt(sum(
(1 - (w_t$response_count / w_t$total)) * ((w_t$total / sum(w_t$total)) ** 2) * 
(w_t$s2_h / w_t$response_count)))
ratio_se_2
## [1] 0.01201918

Part E.

The stratum with the lowest response rate for this question is Faculty. The stratification assumes nonrespondeents have similar opinions as the individuals who are responding.

q_mailed <- c(500, 653, 74, 95)
(w_t$response_count/q_mailed)*100
## [1]  46.40000  78.71363 116.21622  70.52632
w_t
## # A tibble: 4 x 10
##   class response_count yes_count no_count nonresponse yes_ratio total
##   <fct>          <int>     <int>    <int>       <int>     <dbl> <dbl>
## 1 facu…            232       167       65          56     0.720  1374
## 2 clas…            514       459       55          19     0.893  1960
## 3 adms…             86        75       11           5     0.872   252
## 4 acpr…             67        58        9           6     0.866    95
## # ... with 3 more variables: no_ratio <dbl>, sampling_weights <dbl>,
## #   s2_h <dbl>

Chapter 4 Part 1

Part A.

We would use Ratio Estimation and compare the ratio between the time spent num - broad cast time of sports denom - total broadcast time ### Part B. We would use Ratio Estimation num - total number of fish denom - total number of hours they were fishing ### Part C. Ask students how much they spent on books and divde by number of students. We would use Ratio Estimation. num - cost per book denom - number of books -Swarnali suggested using regression estimation where “x” would be number of courses. ### Part D. Take a sample of chickens, weight them all. Weigh the usable meat. Goal is to estimate the total weight. We would use Ratio Estimation num - Usable meat denom - Non-usable meat

Chapter 4 Part 2.

Part A.

t_x <- c(13,7,11,12,4,3,11,3,5)
sum(t_x)
## [1] 69
t_x_bar <-  sum(t_x)/9
t_x_bar
## [1] 7.666667
t_y <- c(10,7,13,17,8,1,15,7,4)
sum(t_y)
## [1] 82
t_y_bar <- sum(t_y)/9
t_y_bar
## [1] 9.111111
xi_xbar <- (t_x - t_x_bar)^2
sx <- sqrt((1/8)*sum(xi_xbar))
sx
## [1] 4.092676
yi_ybar <- (t_y - t_y_bar)^2
sy <- sqrt((1/8)*sum(yi_ybar))
sy
## [1] 5.182771
xiyi <- sum((t_x - t_x_bar)*(t_y - t_y_bar))
R<-xiyi/(8*4.0927*5.1828)
R
## [1] 0.8151969
B<-sum(t_y)/sum(t_x)
B
## [1] 1.188406

Part B.

x <- c(13, 7, 11, 12, 4, 3, 11, 3, 5)
y <- c(10, 7, 13, 17, 8, 1, 15, 7, 4)
t_x <- sum(x)
t_y <- sum(y)
s_x <- sd(x)
s_y <- sd(y)
B <- t_y / t_x

R <- 0
for (i in 1:length(x)) {
R <- R + ((x[i] - (t_x / length(x))) * (y[i] - (t_y / length(y))) / ((length(x) - 1) * s_x * s_y))
}
test <- c()
library(combinat)
## 
## Attaching package: 'combinat'
## The following object is masked from 'package:utils':
## 
##     combn
combinations <- t(combn(c(1:9), 3))
for (i in 1:dim(combinations)[1]) {
test$N_ybar[i] <- 9 * mean(y[combinations[i,]]) 
test$B_hat[i] <- (mean(y[combinations[i,]]) / mean(x[combinations[i,]]))
test$t_y_ratio[i] <- t_x * test$B_hat[i]
test$x_bar[i] <- mean(x[combinations[i,]])
}
R
## [1] 0.8152062
test
## $N_ybar
##  [1]  90 102  75  54  96  72  63 120  93  72 114  90  81 105  84 126 102
## [18]  93  57  99  75  66  78  54  45  96  87  63 111  84  63 105  81  72
## [35]  96  75 117  93  84  48  90  66  57  69  45  36  87  78  54 114  93
## [52] 135 111 102  66 108  84  75  87  63  54 105  96  72  78 120  96  87
## [69]  99  75  66 117 108  84  72  48  39  90  81  57  69  60  36  78
## 
## $B_hat
##  [1] 0.9677419 1.0625000 1.0416667 0.7826087 1.0322581 1.0434783 0.8400000
##  [8] 1.1111111 1.1071429 0.8888889 1.0857143 1.1111111 0.9310345 1.2068966
## [15] 1.0000000 1.1666667 1.2142857 1.0333333 0.9500000 1.1785714 1.2500000
## [22] 1.0000000 0.9629630 0.9473684 0.7142857 1.1851852 1.0000000 1.0000000
## [29] 1.2333333 1.2727273 1.0000000 1.2068966 1.2857143 1.0434783 1.3913043
## [36] 1.1363636 1.3000000 1.4090909 1.1666667 1.1428571 1.3636364 1.5714286
## [43] 1.1875000 1.0952381 1.1538462 0.8000000 1.3809524 1.1304348 1.2000000
## [50] 1.4074074 1.1923077 1.3235294 1.4230769 1.2142857 1.2222222 1.3846154
## [57] 1.5555556 1.2500000 1.1600000 1.2352941 0.9473684 1.4000000 1.1851852
## [64] 1.2631579 1.3684211 1.4814815 1.6842105 1.3809524 1.2692308 1.3888889
## [71] 1.1000000 1.5000000 1.2857143 1.4000000 1.3333333 1.6000000 1.0833333
## [78] 1.6666667 1.3500000 1.5833333 1.3529412 1.0526316 1.0909091 1.3684211
## 
## $t_y_ratio
##  [1]  66.77419  73.31250  71.87500  54.00000  71.22581  72.00000  57.96000
##  [8]  76.66667  76.39286  61.33333  74.91429  76.66667  64.24138  83.27586
## [15]  69.00000  80.50000  83.78571  71.30000  65.55000  81.32143  86.25000
## [22]  69.00000  66.44444  65.36842  49.28571  81.77778  69.00000  69.00000
## [29]  85.10000  87.81818  69.00000  83.27586  88.71429  72.00000  96.00000
## [36]  78.40909  89.70000  97.22727  80.50000  78.85714  94.09091 108.42857
## [43]  81.93750  75.57143  79.61538  55.20000  95.28571  78.00000  82.80000
## [50]  97.11111  82.26923  91.32353  98.19231  83.78571  84.33333  95.53846
## [57] 107.33333  86.25000  80.04000  85.23529  65.36842  96.60000  81.77778
## [64]  87.15789  94.42105 102.22222 116.21053  95.28571  87.57692  95.83333
## [71]  75.90000 103.50000  88.71429  96.60000  92.00000 110.40000  74.75000
## [78] 115.00000  93.15000 109.25000  93.35294  72.63158  75.27273  94.42105
## 
## $x_bar
##  [1] 10.333333 10.666667  8.000000  7.666667 10.333333  7.666667  8.333333
##  [8] 12.000000  9.333333  9.000000 11.666667  9.000000  9.666667  9.666667
## [15]  9.333333 12.000000  9.333333 10.000000  6.666667  9.333333  6.666667
## [22]  7.333333  9.000000  6.333333  7.000000  9.000000  9.666667  7.000000
## [29] 10.000000  7.333333  7.000000  9.666667  7.000000  7.666667  7.666667
## [36]  7.333333 10.000000  7.333333  8.000000  4.666667  7.333333  4.666667
## [43]  5.333333  7.000000  4.333333  5.000000  7.000000  7.666667  5.000000
## [50]  9.000000  8.666667 11.333333  8.666667  9.333333  6.000000  8.666667
## [57]  6.000000  6.666667  8.333333  5.666667  6.333333  8.333333  9.000000
## [64]  6.333333  6.333333  9.000000  6.333333  7.000000  8.666667  6.000000
## [71]  6.666667  8.666667  9.333333  6.666667  6.000000  3.333333  4.000000
## [78]  6.000000  6.666667  4.000000  5.666667  6.333333  3.666667  6.333333

Part C.

The histograms are fairly similar but the t_y_ratio is a bit more spread.

hist(test$t_y_ratio)

hist(test$N_ybar)

Part D.

The mean and variance of t_y_ratio are 82.79241 and 14.03418.

The mean and variance of N_ybar are 82 and 22.1207

The bias of t_y_ratio is 0.089106 Using the

mean(test$t_y_ratio)
## [1] 82.79241
sd(test$t_y_ratio)
## [1] 14.03418
mean(test$N_ybar)
## [1] 82
sd(test$N_ybar)
## [1] 22.1207
-cov(test$B_hat, test$x_bar)
## [1] 0.08910597

Part E.

The approximate bias of t_y_ratio is 0.07577157. The difference between the true bias and the approximate bias is 0.0133344.

bias_t_y_r <-(1-3/9)*(1/(3*7.6667))*(B*s_x^2 - R*s_x*s_y) 

0.08910597- 0.07577157
## [1] 0.0133344

Chapter 10

  1. The plot is below. There seems to be
require(stats)
require(graphics)
trees
##    Girth Height Volume
## 1    8.3     70   10.3
## 2    8.6     65   10.3
## 3    8.8     63   10.2
## 4   10.5     72   16.4
## 5   10.7     81   18.8
## 6   10.8     83   19.7
## 7   11.0     66   15.6
## 8   11.0     75   18.2
## 9   11.1     80   22.6
## 10  11.2     75   19.9
## 11  11.3     79   24.2
## 12  11.4     76   21.0
## 13  11.4     76   21.4
## 14  11.7     69   21.3
## 15  12.0     75   19.1
## 16  12.9     74   22.2
## 17  12.9     85   33.8
## 18  13.3     86   27.4
## 19  13.7     71   25.7
## 20  13.8     64   24.9
## 21  14.0     78   34.5
## 22  14.2     80   31.7
## 23  14.5     74   36.3
## 24  16.0     72   38.3
## 25  16.3     77   42.6
## 26  17.3     81   55.4
## 27  17.5     82   55.7
## 28  17.9     80   58.3
## 29  18.0     80   51.5
## 30  18.0     80   51.0
## 31  20.6     87   77.0
plot(trees$Girth, trees$Volume, ylab="Volume", xlab="Diameter", main = 
"Cherry Trees")

  1. I had to look up what girth was and it is the circumference so I had to divide by pi to get the diameter.

The confidence interval is from (84548.15, 105996.2).

N = 2967
xbar <- mean(trees$Girth)
ybar <- mean(trees$Volume)

b <- ybar/xbar
b
## [1] 2.277331
t_y_ratio <- b*41835
t_y_ratio
## [1] 95272.16
trees
##    Girth Height Volume
## 1    8.3     70   10.3
## 2    8.6     65   10.3
## 3    8.8     63   10.2
## 4   10.5     72   16.4
## 5   10.7     81   18.8
## 6   10.8     83   19.7
## 7   11.0     66   15.6
## 8   11.0     75   18.2
## 9   11.1     80   22.6
## 10  11.2     75   19.9
## 11  11.3     79   24.2
## 12  11.4     76   21.0
## 13  11.4     76   21.4
## 14  11.7     69   21.3
## 15  12.0     75   19.1
## 16  12.9     74   22.2
## 17  12.9     85   33.8
## 18  13.3     86   27.4
## 19  13.7     71   25.7
## 20  13.8     64   24.9
## 21  14.0     78   34.5
## 22  14.2     80   31.7
## 23  14.5     74   36.3
## 24  16.0     72   38.3
## 25  16.3     77   42.6
## 26  17.3     81   55.4
## 27  17.5     82   55.7
## 28  17.9     80   58.3
## 29  18.0     80   51.5
## 30  18.0     80   51.0
## 31  20.6     87   77.0
se_sqr <- 1/30*sum(((trees$Volume-(b*trees$Girth))**2))
se <- sqrt(se_sqr)
se
## [1] 9.698086
fpc <- (1-31/2967)

SE_t_y_ratio <- fpc*((41835/xbar)^2)*(se_sqr/31)
s <- sqrt(SE_t_y_ratio)

t_y_ratio + (1.96 * s)
## [1] 105996.2
t_y_ratio - (1.96 * s)
## [1] 84548.15