Problem 1 - 4.11

Part A.

Histogram below; the data are right-skewed.

library(SDaA)
n <- nrow(counties)  # 100 sampled counties
head(counties)
##    rn state          county landarea totpop physicia enroll percpub
## 1  27    AL        Escambia      948  36023       24   6931    95.4
## 2  48    AL        Marshall      567  73524       44  11928    98.6
## 3  85    AK Prince of Wales     7325   6408        7   1317    98.6
## 4 126    AR           Cross      616  19261       11   4066    99.1
## 5 158    AR          Newton      823   7649        3   1579    99.2
## 6 186    CA           Butte     1640 188377      327  27899    94.5
##   civlabor unemp farmpop numfarm farmacre fedgrant fedciv milit veterans
## 1    15247  1339     531     414    90646    122.3     85   370     3723
## 2    38803  3189    1592    1582   136599    235.7    316   748     8510
## 3     2787   383      71       2      214     32.2    126    63      809
## 4     8336   704     762     492   339830     81.4     87   107     1505
## 5     3280   270     600     562    98106     31.7     71    44      807
## 6    77500  7303    2818    2030   494530    688.0    570   577    23958
##   percviet
## 1     27.1
## 2     29.1
## 3     44.6
## 4     23.9
## 5     25.5
## 6     27.6
#Part A
hist(counties$physicia, main="Histogram of Number of Physicians for the 100 Counties", xlab="Physicians") 

Part B.

\(N\bar{y} = 933411\), which is an estimate of the total number of physicians in the US. The standard error is 491982.8.
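In symbols, this is the usual SRS estimator of a population total and its standard error:

\[
\hat{t} = N\bar{y}, \qquad \mathrm{SE}(\hat{t}) = N\sqrt{\left(1-\frac{n}{N}\right)\frac{s^2}{n}},
\]

with \(N = 3141\) counties, \(n = 100\), and \(s^2\) the sample variance of the physician counts.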

y_bar <- mean(counties$physicia)
N <- 3141

N*y_bar
## [1] 933411
fpc <- 1-(100/N)
s_sqr <- var(counties$physicia)

N*sqrt(fpc*(s_sqr/100))
## [1] 491982.8

Part C.

I would argue that ratio estimation is the better choice here, since a line through the points would generally pass through the origin.
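For reference, the ratio estimator applied in Part D and its standard error follow the standard formulas:

\[
\hat{B} = \frac{\bar{y}}{\bar{x}}, \qquad \hat{t}_{yr} = \hat{B}\,t_x, \qquad \mathrm{SE}(\hat{t}_{yr}) = t_x\sqrt{\left(1-\frac{n}{N}\right)\frac{s_e^2}{n\,\bar{x}^2}},
\]

where \(s_e^2 = \frac{1}{n-1}\sum_i (y_i - \hat{B}x_i)^2\), \(y\) is the physician count, and \(x\) is the county population.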

plot(counties$totpop, counties$physicia, xlab="Total Population for each County", ylab="Number of Physicians", pch=16)

ybar <- mean(counties$physicia)
xbar <- mean(counties$totpop)
     
B_hat <- ybar/xbar

plot(counties$totpop, counties$physicia, pch = 16)  # totpop on x so the ratio line has slope B_hat
abline(a=0,b=B_hat, col = "red", lwd = 3)

#Should we transform the data? I emailed to ask but got no response, so I left it as is.
fit <- lm(counties$physicia ~ log(counties$totpop))  # exploratory fit on the log scale
plot(fit)

Part D.

#Ratio Estimation
B_hat
## [1] 0.002507104
tx <- 255077536 

t_hat_yr <- B_hat*tx
t_hat_yr #total number of physicians
## [1] 639506
se_inner <- sum((counties$physicia - B_hat*counties$totpop)^2)

se_sqr <- 1/(99)*se_inner  # s_e^2, residual variance about the ratio line (n - 1 = 99)
se <- sqrt(se_sqr)
se #Residual standard deviation s_e about the ratio line
## [1] 415.0517
#Standard error of the ratio estimator of the total number of physicians
sqrt(fpc)*(tx/mean(counties$totpop))*(se/sqrt(100))
## [1] 87885.27
#Regression Estimation
x_bar_u <- tx/N  # population mean of total population per county
fit <- lm(counties$physicia ~ counties$totpop)
summary(fit)
## 
## Call:
## lm(formula = counties$physicia ~ counties$totpop)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -767.24  -43.44    4.84   32.93 3105.60 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     -5.423e+01  3.490e+01  -1.554    0.123    
## counties$totpop  2.965e-03  6.519e-05  45.477   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 340.3 on 98 degrees of freedom
## Multiple R-squared:  0.9548, Adjusted R-squared:  0.9543 
## F-statistic:  2068 on 1 and 98 DF,  p-value: < 2.2e-16
b_0 <- -54.23  # intercept from summary(fit); coef(fit)[1] would avoid rounding
b_1 <- 0.00296 # slope from summary(fit); coef(fit)[2] would avoid rounding
y_bar_hat_reg <- b_0 + b_1*x_bar_u
y_bar_hat_reg
## [1] 186.1487
y_hat_reg <- y_bar_hat_reg*N
y_hat_reg
## [1] 584693.1
sqrt((fpc)*(1/100)*(1154644.1)*(1-0.955)) #SE y bar hat reg (plug-in values for the residual-variance term and R^2)
## [1] 22.42872
se_y_reg <- N*33.316  # 33.316 = sqrt(fpc*(SSE/(n-1))/n), the residual-based SE of y_bar_hat_reg
se_y_reg #SE of y hat reg
## [1] 104645.6
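
The regression-estimation steps above follow:

\[
\hat{\bar{y}}_{reg} = \hat{b}_0 + \hat{b}_1\bar{x}_U, \qquad \hat{t}_{reg} = N\,\hat{\bar{y}}_{reg}, \qquad \mathrm{SE}(\hat{\bar{y}}_{reg}) = \sqrt{\left(1-\frac{n}{N}\right)\frac{s_e^2}{n}},
\]

where \(\bar{x}_U = t_x/N\) and \(s_e^2\) is the residual variance from the fitted line.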

Part E

From the above calculations, both ratio and regression estimation give much smaller standard errors than the SRS estimate from Part B, with ratio estimation having the smaller of the two. However, the regression estimate is closer to the true total of 532,638 physicians.
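
For a side-by-side view, here is a small sketch (using the values computed above and the true total of 532,638) that collects the three estimates:

estimates <- data.frame(
  method = c("SRS", "Ratio", "Regression"),
  total  = c(933411, 639506, 584693.1),
  se     = c(491982.8, 87885.27, 104645.6)
)
estimates$abs_error <- abs(estimates$total - 532638)  # distance from the true total
estimates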

Problem 2 - 4.16

The estimated population mean is \(\bar{y}_{post} = 295560.8\).

The variance is 267953512 and the SE is 16369.29. The 95% confidence interval is (263477, 327644.6). This method gives a narrower confidence interval than the SRS interval of (260706, 335088), so I would argue that poststratification is the better method to use here.
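
These come from the poststratification formulas:

\[
\bar{y}_{post} = \sum_{h=1}^{4}\frac{N_h}{N}\,\bar{y}_h, \qquad \hat{V}(\bar{y}_{post}) \approx \left(1-\frac{n}{N}\right)\frac{1}{n}\sum_{h=1}^{4}\frac{N_h}{N}\,s_h^2
\]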

library(SDaA)
data("agpop")
N <- nrow(agpop)
n <- 300
Nh <- c(220, 1054, 1382, 422)
nh <- c(21,103,135,41)
y_bar_nh <- c(97629.8, 300504.2, 211315, 662295.5)
var_nh <- c(7647472708,29618183543,53587487856,396185950266)

#Population Mean
y_bar_post <- sum((Nh/N)*y_bar_nh)
y_bar_post
## [1] 295560.8
#Variance
fpc <- (1-n/N)

var_ybar_post <- fpc*sum((Nh/N)*var_nh)/n
var_ybar_post
## [1] 267953512
se_ybar_post <- sqrt(var_ybar_post)
se_ybar_post
## [1] 16369.29
#Confidence Interval
CI_lower <- y_bar_post - 1.96*se_ybar_post
CI_lower
## [1] 263477
CI_upper <- y_bar_post + 1.96*se_ybar_post
CI_upper
## [1] 327644.6

Problem 3 - 4.38

Our estimate of the total number of words in the dictionary is 16443.63 with an SE of 705.0922. Our estimate of the total number of words Kajal knows is 14953.2 with an SE of 663.4724.

The estimated percentage of words Kajal knows is 90.93611%.

#Dictionary: Easier English Intermediate Dictionary, Second Edition (733 pages total)

set.seed(1234)
page_num <- sample(1:733, 30, replace=F)
sort(page_num)
##  [1]   7  29  33  84 114 134 156 166 169 192 204 206 211 216 226 372 373
## [18] 394 446 456 467 483 502 574 587 602 628 646 665 732
#Data Collection - THIS TOOK FOREVER 
x <- c(15, 30, 26, 24, 27, 22, 19, 21, 23, 24, 22, 27, 21, 26, 19, 21, 19, 25, 19, 24, 25, 34, 4, 18, 24, 21, 17, 21, 26, 29)

y <- c(12, 27, 26, 22, 26, 20, 18, 18, 17, 23, 17, 22, 19, 22, 18, 20, 17, 24, 19, 24, 22, 29, 3, 18, 21, 20, 16, 21, 23, 28)

N <- 733
n <- 30 

x_bar <- sum(x)/n
x_bar
## [1] 22.43333
y_bar <- sum(y)/n
y_bar
## [1] 20.4
#Estimate of total number of words on the pages
N*x_bar
## [1] 16443.63
#Estimate of total number of words that are known by Kajal
N*y_bar
## [1] 14953.2
#Standard Error for the total number of words that are on the pages
fpc <- 1-(30/733)
N*sqrt(fpc*(var(x)/30))
## [1] 705.0922
#Standard Error for the total number of words Kajal knows from pages
N*sqrt(fpc*(var(y)/30))
## [1] 663.4724
#Estimate of percentage of known words
(N*y_bar)/(N*x_bar)
## [1] 0.9093611
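
The percentage of known words is itself a ratio estimator, \(\hat{B} = \bar{y}/\bar{x}\), so as a sketch its standard error could be estimated with the usual ratio-estimator variance:

#Sketch: SE of the estimated proportion of known words
B_hat <- y_bar/x_bar
e <- y - B_hat*x                      # residuals about the ratio line
se_B <- sqrt(fpc*(sum(e^2)/(n - 1))/(n*x_bar^2))
se_B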

Problem 4 - 5.1

No, these estimates are not valid. Individuals in the same household are likely to hold similar opinions, so their responses cannot be treated as independent. It would be better to treat households as clusters than to analyze the calls as if they were an SRS of individuals.

Problem 5 - 5.2

Part A.

We are attempting to estimate the percentage of children who have access to guns in their homes.

This is a cluster sample because there are two levels of sampling units. The primary sampling units (PSUs) are the selected clinics; the secondary sampling units (SSUs) are the parents. It is one-stage cluster sampling because PSUs are sampled and then every SSU within each sampled PSU is observed, like a census within the cluster.

We would estimate the percentage of households with guns by using the formula

\(\hat{p} = \sum_i \frac{m_i\,\hat{p}_i}{m_0}\)

Here, \(m_0 = \sum_i m_i\), where \(m_i\) is the number of secondary sampling units sampled from PSU \(i\) and \(\hat{p}_i\) is the sample proportion with guns within PSU \(i\). The standard error formula can be found in the book; a sketch of the computation is given below.
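
As a sketch with made-up numbers (the problem provides no data), \(\hat{p}\) is a ratio of cluster totals, and its standard error can be estimated from the cluster-level residuals:

#Hypothetical data: 5 clinics, m_i parents surveyed at clinic i,
#t_i households at clinic i reporting a gun (all numbers invented)
m_i <- c(40, 55, 30, 62, 48)
t_i <- c(6, 11, 4, 15, 9)
n_psu <- length(m_i)
p_hat <- sum(t_i)/sum(m_i)            # equals sum(m_i*p_i)/m_0
e <- t_i - p_hat*m_i                  # cluster-level residuals
se_p <- sqrt((sum(e^2)/(n_psu - 1))/(n_psu*mean(m_i)^2)) # fpc ignored
p_hat
se_p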

Part B.

The sampled population for this study is households with children who visit these pediatric clinics, not all households with children. This sampling method is therefore not representative: parents whose children are sick more often may, for instance, be more neglectful and may also be more likely to have guns. That is a strong assumption to make, but it shows how the "representativeness" of the sample could be affected.

Problem 6 - 5.6

The estimated total number of worm fragments is \(\hat{t}_{unb}\) = 50653.33.

\(s^2_t\) = 2611.879

The variance of the unbiased estimator is 72145966 and the standard error is 8493.878.

The 95% confidence interval for the estimated total number of worm fragments is (34005.33,67301.33).

\(\hat{V}_{WR}\) = 73219669 is the estimated variance for a cluster sample treated as if drawn with replacement.

When we compare the variances, the ratio is 0.9853359, suggesting they are very similar. The 95% CI based on \(\hat{V}_{WR}\) is (33881.91, 67424.76), which is close to the 95% confidence interval based on the two-stage variance above.
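
The quantities above follow the unbiased two-stage estimator and its variance:

\[
\hat{t}_{unb} = \frac{N}{n}\sum_i \hat{t}_i, \qquad \hat{V}(\hat{t}_{unb}) = N^2\left(1-\frac{n}{N}\right)\frac{s_t^2}{n} + \frac{N}{n}\sum_i\left(1-\frac{m}{M}\right)M^2\frac{s_i^2}{m},
\]

where \(\hat{t}_i = M\bar{y}_i\) is the estimated total for can \(i\), \(s_t^2\) is the sample variance of the \(\hat{t}_i\), and \(s_i^2\) is the within-can variance.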

N <- 580
n <- 12
M <- 24
m <- 3

can_o_worms <- c(1, 4, 0, 3, 4, 0, 5, 3, 7, 3, 4, 0, 5, 2, 1, 6, 9, 7, 5, 0, 3, 1, 7, 0, 7, 4, 2, 6, 8, 3, 1, 2, 5, 4, 9, 0)
worm_matrix <- matrix(can_o_worms, nrow = 3, byrow = TRUE)
worm_matrix
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
## [1,]    1    4    0    3    4    0    5    3    7     3     4     0
## [2,]    5    2    1    6    9    7    5    0    3     1     7     0
## [3,]    7    4    2    6    8    3    1    2    5     4     9     0
#Total Number of Worm Fragments
w_ij <- (N * M) / (n * m)  # sampling weight per counted spot: (580/12)*(24/3)
t_hat_unb <- w_ij * sum(can_o_worms)
t_hat_unb
## [1] 50653.33
#Two Stage Sampling Variance
t_i <- M * colMeans(worm_matrix)
s2_t <- 1 / (n - 1) * sum((t_i - t_hat_unb / N) ^ 2)
s2_i <- apply(worm_matrix, 2, var)
v_hat_t_hat_unb <- (N^2) * (1 - n / N) * (s2_t / n) + (N / n) * sum((1 - m / M) * (M^2) * (s2_i / m))
v_hat_t_hat_unb
## [1] 72145966
sqrt(v_hat_t_hat_unb)
## [1] 8493.878
#Confidence Interval 95% 
t_hat_unb + 1.96*sqrt(v_hat_t_hat_unb)
## [1] 67301.33
t_hat_unb - 1.96*sqrt(v_hat_t_hat_unb)
## [1] 34005.33
#V_WR From Book
v_WR <- (N^2) * (s2_t / n)
v_WR
## [1] 73219669
#Comparison 

#95% CI V WR
t_hat_unb + 1.96*sqrt(v_WR)
## [1] 67424.76
t_hat_unb - 1.96*sqrt(v_WR)
## [1] 33881.91
#Comparison of v_hat_t_hat_unb and v_WR
v_hat_t_hat_unb/v_WR
## [1] 0.9853359