library(haven)
library(dplyr)
setwd("/Users/isaiahmireles/Desktop")
wr_df <- 
  read_dta("SamplingPrac/usfacts.dta") |> 
  select(state, home, area, density)
wor_df <- 
  read_dta("SamplingPrac/usfacts.dta") |> 
  select(state, home, area, density)

Sampling

With Replacement :

Infinite Pop. Assumption :

set.seed(21)
s_wr <- sample(wr_df$state, 5,replace = T)
cat(s_wr)
## New York Arizona South Dakota Virginia Iowa

Without Replacement :

Finite Population ( \(N=51\) – inc. District of Columbia)

set.seed(25)
s_wor <- sample(wor_df$state, 5,replace = T)
cat(s_wor)
## Connecticut Nevada Minnesota Mississippi West Virginia

Generate df :

wor_df <- wor_df[wor_df$state
 %in% s_wor,]
wor_df
wr_df <- wr_df[wr_df$state
 %in% s_wr,]
wr_df

Graphics :

Abritrary Func. :

library(tidyverse)
us_map <- map_data("state") # get mp dat

generate_plts <- function(df) {

  df <-
    df |>
    mutate(state = tolower(state))

  map_df <-
    us_map |>
    right_join(df, by = c("region" = "state")) |>
    select(-subregion)

  p1 <-
    ggplot(map_df, aes(long, lat, group = group, fill = density)) +
    geom_polygon(color = "white") +
    coord_fixed(1.3) +
    scale_fill_viridis_c(option = "C", name = "Density") +
    ggtitle("Population Density")

  p2 <-
    ggplot(map_df, aes(long, lat, group = group, fill = area)) +
    geom_polygon(color = "white") +
    coord_fixed(1.3) +
    scale_fill_distiller(palette = "Blues", name = "Land Area") +
    ggtitle("Land Area")

  p3 <-
    ggplot(map_df, aes(long, lat, group = group, fill = home)) +
    geom_polygon(color = "white") +
    coord_fixed(1.3) +
    scale_fill_distiller(palette = "Reds", name = "Homes") +
    ggtitle("Number of Homes")

  list(p1 = p1, p2 = p2, p3 = p3)
}

w/ Replacement

plots <- generate_plts(wr_df)

plots$p1

plots$p2

plots$p3

w/out Replacement

plots <- generate_plts(wor_df)

plots$p1

plots$p2

plots$p3

Mean Homes Est.

w/out Replacement

mean(wor_df$home)
## [1] 1296834

w/ Replacement

mean(wr_df$home)
## [1] 2935930

Recall our est. are different as they come from different states :

print("wor")
## [1] "wor"
cat(wor_df$state)
## Connecticut Minnesota Mississippi Nevada West Virginia
print("wr")
## [1] "wr"
cat(wr_df$state)
## Arizona Iowa New York South Dakota Virginia

Total Area Est.

Recall :

\[ \pi_i=\frac{n}{N} \]

and,

\[ w_i =\frac{1}{\pi_i} \]

\[ \hat{\tau} =w_i\sum_{i \in s}y_i \]

\[ \hat{\tau} = \\ \frac{N}{n}\sum y_i= \\ N*(\frac{1}{n}\sum y_i) = N*(\bar{y}) \\ =N\bar{y} \]

w/out Replacement

xbar <- mean(wor_df$area)
N <- 51
Tot_Est_wor <- xbar*N
Tot_Est_wor
## [1] 2705713
# or, 
n <- 5
wt <- N/n
sum(wor_df$area)*wt
## [1] 2705713

As we can see^ same number for our \(\tau\) est

w/ Replacement

xbar <- mean(wr_df$area)
N <- 51
Tot_Est_wr <- xbar*N
Tot_Est_wr
## [1] 3388409

Proportion of Density Below 100

w/out Replacement

prop_wor <- sum((wor_df$density < 100))/length(wor_df$density)
prop_wor
## [1] 0.8

w/ Replacement

prop_wr <- sum((wr_df$density < 100))/length(wr_df$density)
prop_wr
## [1] 0.6

FPC

We can say \(\frac{N-n}{N-1} \approx \frac{N-n}{N}\) as, \(N-1 \approx N \iff \text{N is large}\)

FPC <- (N-n)/(N-1)
s2 <- sum((wor_df$home - mean(wor_df$home))^2)
s <- sqrt(s2)

Confidence Interval : 95% conf

We for each example are estimating : \(\hat{\tau},\bar{x},\hat{p}\) :

\[ \hat{\theta}\pm Z_{c} *\text{S.E.}(\hat{\theta}) \]

z_crit <- qnorm(1-.024)

W/ Replacement

Mean Homes Interval : Infinite Population :

Where \(\text{S.E.}(\bar{x})=\sqrt{\hat{V}_{\bar{x}}}=\frac{s}{\sqrt{n}}\)

\[ \bar{x}\pm Z_{c} * \text{S.E.}(\bar{x}) \]

xbar <- mean(wr_df$home)
SE <- s/sqrt(n)
ME <- z_crit*SE
CI_wr <- xbar + c(-1,1)*ME
CI_wr
## [1] 2019471 3852388

Standard Err :

SE
## [1] 463473.8

Total US Land Area Interval : Infinite Population :

Notice that the variance is S.E. is different because the standard err. is :

\[ \text{S.E.}(\hat{\tau}) \\ = \text{S.E.}(N\bar{x}) \\ = \sqrt{\text{Var}(N\bar{x})} \]

and,

\[ \text{Var}(N\bar{x}) \\ =N^2\text{Var}(\bar{x}) \]

and as we know from our variance est of \(\bar{x}\),

\[ \text{Var}(\bar{x}) :=\frac{\sigma^2}{n}\approx\frac{s^2}{n} \]

So :

\[ \sqrt{N^2\frac{s^2}{n}} \]

N2 <- N^2
SE <- sqrt(N2 * s2/n)
ME <- z_crit*SE
CI_wr <- Tot_Est_wr + c(-1,1)*ME
CI_wr
## [1] -43350970  50127789

Standard Err :

SE
## [1] 23637163

Low-density communities : Infinite Population :

Notice we need the variance of \(\hat{p}\)

\[ \text{Var}(\hat{p})=\frac{\sum y_i}{n} \]

for :

\[ y_i =\begin{cases}1, & \text{if density < 100} \\0, & \text{otherwise}\end{cases} \]

So,

\[ \text{Var}(\hat{p})\\ =\text{Var}(\frac{\sum y_i}{n})\\ =\frac{1}{n}\text{Var}(\sum y_i) \] where we assume \(y_i \text{ is independent from } y_j\), so variance simplifies :

\[ =\frac{1}{n^2}\sum\text{Var}(y_i) \\ =\frac{1}{n^2}npp^c \\ =\frac{1}{n}pp^c \]

Therefore :

\[ \hat{p}\pm Z_c * \sqrt{\frac{1}{n}\hat{p}\hat{p}^c} \]

SE <- sqrt((1/n)*prop_wr*(1-prop_wr))
ME <- z_crit*SE
CI_wr <- prop_wr + c(-1,1)*ME
CI_wr
## [1] 0.1667803 1.0332197

Standard Err :

SE
## [1] 0.219089

W/out Replacement & w/out FPC :

Mean Homes Interval : Finite Population :

xbar <- mean(wor_df$home)
SE <- s/sqrt(n)
ME <- z_crit*SE
CI_wor <- xbar + c(-1,1)*ME
CI_wor
## [1]  380375.6 2213292.4

Standard Err :

SE
## [1] 463473.8

Total US Land Area Interval : Finite Population :

N2 <- N^2
SE <- sqrt(N2 * s2/n)
ME <- z_crit*SE
CI_wr <- Tot_Est_wor + c(-1,1)*ME
CI_wr
## [1] -44033667  49445093

Standard Err :

SE
## [1] 23637163

Low-density communities : Finite Population :

SE <- sqrt((1/n)*prop_wor*(1-prop_wor))
ME <- z_crit*SE
CI_wor <- prop_wor + c(-1,1)*ME
CI_wor
## [1] 0.4462776 1.1537224

Standard Err :

SE
## [1] 0.1788854

W/out Replacement & w/ FPC :

Mean Homes Interval : Finite Pop. & FPC :

\[ \bar{x}\pm Z_{c} (\frac{s}{\sqrt{n}}*\sqrt{\text{FPC}}) \]

Where : \(\text{FPC} = \frac{N-n}{N-1}\)

xbar <- mean(wor_df$home)
SE <- s/sqrt(n)*sqrt(FPC)
ME <- z_crit*SE
CI_wor <-xbar + c(-1,1)*ME
CI_wor
## [1]  417798 2175870

Standard Err :

SE
## [1] 444548.4

Total US Land Area Interval : Finite Pop. & FPC :

N2 <- N^2
SE <- sqrt(N2 * s2/n)*sqrt(FPC)
ME <- z_crit*SE
CI_wr <- Tot_Est_wor + c(-1,1)*ME
CI_wr
## [1] -42125125  47536551

Standard Err :

SE
## [1] 22671970

Low-density communities : Finite Pop. & FPC :

SE <- sqrt((1/n)*prop_wor*(1-prop_wor))*sqrt(FPC)
ME <- z_crit*SE
CI_wor <- prop_wor + c(-1,1)*ME
CI_wor
## [1] 0.4607214 1.1392786

Standard Err :

SE
## [1] 0.1715809