library(haven)
library(dplyr)
setwd("/Users/isaiahmireles/Desktop")
wr_df <-
read_dta("SamplingPrac/usfacts.dta") |>
select(state, home, area, density)
wor_df <-
read_dta("SamplingPrac/usfacts.dta") |>
select(state, home, area, density)
Infinite Pop. Assumption :
set.seed(21)
s_wr <- sample(wr_df$state, 5,replace = T)
cat(s_wr)
## New York Arizona South Dakota Virginia Iowa
Finite Population ( \(N=51\) – inc. District of Columbia)
set.seed(25)
s_wor <- sample(wor_df$state, 5,replace = T)
cat(s_wor)
## Connecticut Nevada Minnesota Mississippi West Virginia
wor_df <- wor_df[wor_df$state
%in% s_wor,]
wor_df
wr_df <- wr_df[wr_df$state
%in% s_wr,]
wr_df
Abritrary Func. :
library(tidyverse)
us_map <- map_data("state") # get mp dat
generate_plts <- function(df) {
df <-
df |>
mutate(state = tolower(state))
map_df <-
us_map |>
right_join(df, by = c("region" = "state")) |>
select(-subregion)
p1 <-
ggplot(map_df, aes(long, lat, group = group, fill = density)) +
geom_polygon(color = "white") +
coord_fixed(1.3) +
scale_fill_viridis_c(option = "C", name = "Density") +
ggtitle("Population Density")
p2 <-
ggplot(map_df, aes(long, lat, group = group, fill = area)) +
geom_polygon(color = "white") +
coord_fixed(1.3) +
scale_fill_distiller(palette = "Blues", name = "Land Area") +
ggtitle("Land Area")
p3 <-
ggplot(map_df, aes(long, lat, group = group, fill = home)) +
geom_polygon(color = "white") +
coord_fixed(1.3) +
scale_fill_distiller(palette = "Reds", name = "Homes") +
ggtitle("Number of Homes")
list(p1 = p1, p2 = p2, p3 = p3)
}
plots <- generate_plts(wr_df)
plots$p1
plots$p2
plots$p3
plots <- generate_plts(wor_df)
plots$p1
plots$p2
plots$p3
mean(wor_df$home)
## [1] 1296834
mean(wr_df$home)
## [1] 2935930
Recall our est. are different as they come from different states :
print("wor")
## [1] "wor"
cat(wor_df$state)
## Connecticut Minnesota Mississippi Nevada West Virginia
print("wr")
## [1] "wr"
cat(wr_df$state)
## Arizona Iowa New York South Dakota Virginia
Recall :
\[ \pi_i=\frac{n}{N} \]
and,
\[ w_i =\frac{1}{\pi_i} \]
\[ \hat{\tau} =w_i\sum_{i \in s}y_i \]
\[ \hat{\tau} = \\ \frac{N}{n}\sum y_i= \\ N*(\frac{1}{n}\sum y_i) = N*(\bar{y}) \\ =N\bar{y} \]
xbar <- mean(wor_df$area)
N <- 51
Tot_Est_wor <- xbar*N
Tot_Est_wor
## [1] 2705713
# or,
n <- 5
wt <- N/n
sum(wor_df$area)*wt
## [1] 2705713
As we can see^ same number for our \(\tau\) est
xbar <- mean(wr_df$area)
N <- 51
Tot_Est_wr <- xbar*N
Tot_Est_wr
## [1] 3388409
prop_wor <- sum((wor_df$density < 100))/length(wor_df$density)
prop_wor
## [1] 0.8
prop_wr <- sum((wr_df$density < 100))/length(wr_df$density)
prop_wr
## [1] 0.6
We can say \(\frac{N-n}{N-1} \approx \frac{N-n}{N}\) as, \(N-1 \approx N \iff \text{N is large}\)
FPC <- (N-n)/(N-1)
s2 <- sum((wor_df$home - mean(wor_df$home))^2)
s <- sqrt(s2)
We for each example are estimating : \(\hat{\tau},\bar{x},\hat{p}\) :
\[ \hat{\theta}\pm Z_{c} *\text{S.E.}(\hat{\theta}) \]
z_crit <- qnorm(1-.024)
Where \(\text{S.E.}(\bar{x})=\sqrt{\hat{V}_{\bar{x}}}=\frac{s}{\sqrt{n}}\)
\[ \bar{x}\pm Z_{c} * \text{S.E.}(\bar{x}) \]
xbar <- mean(wr_df$home)
SE <- s/sqrt(n)
ME <- z_crit*SE
CI_wr <- xbar + c(-1,1)*ME
CI_wr
## [1] 2019471 3852388
SE
## [1] 463473.8
Notice that the variance is S.E. is different because the standard err. is :
\[ \text{S.E.}(\hat{\tau}) \\ = \text{S.E.}(N\bar{x}) \\ = \sqrt{\text{Var}(N\bar{x})} \]
and,
\[ \text{Var}(N\bar{x}) \\ =N^2\text{Var}(\bar{x}) \]
and as we know from our variance est of \(\bar{x}\),
\[ \text{Var}(\bar{x}) :=\frac{\sigma^2}{n}\approx\frac{s^2}{n} \]
So :
\[ \sqrt{N^2\frac{s^2}{n}} \]
N2 <- N^2
SE <- sqrt(N2 * s2/n)
ME <- z_crit*SE
CI_wr <- Tot_Est_wr + c(-1,1)*ME
CI_wr
## [1] -43350970 50127789
SE
## [1] 23637163
Notice we need the variance of \(\hat{p}\)
\[ \text{Var}(\hat{p})=\frac{\sum y_i}{n} \]
for :
\[ y_i =\begin{cases}1, & \text{if density < 100} \\0, & \text{otherwise}\end{cases} \]
So,
\[ \text{Var}(\hat{p})\\ =\text{Var}(\frac{\sum y_i}{n})\\ =\frac{1}{n}\text{Var}(\sum y_i) \] where we assume \(y_i \text{ is independent from } y_j\), so variance simplifies :
\[ =\frac{1}{n^2}\sum\text{Var}(y_i) \\ =\frac{1}{n^2}npp^c \\ =\frac{1}{n}pp^c \]
Therefore :
\[ \hat{p}\pm Z_c * \sqrt{\frac{1}{n}\hat{p}\hat{p}^c} \]
SE <- sqrt((1/n)*prop_wr*(1-prop_wr))
ME <- z_crit*SE
CI_wr <- prop_wr + c(-1,1)*ME
CI_wr
## [1] 0.1667803 1.0332197
SE
## [1] 0.219089
xbar <- mean(wor_df$home)
SE <- s/sqrt(n)
ME <- z_crit*SE
CI_wor <- xbar + c(-1,1)*ME
CI_wor
## [1] 380375.6 2213292.4
SE
## [1] 463473.8
N2 <- N^2
SE <- sqrt(N2 * s2/n)
ME <- z_crit*SE
CI_wr <- Tot_Est_wor + c(-1,1)*ME
CI_wr
## [1] -44033667 49445093
SE
## [1] 23637163
SE <- sqrt((1/n)*prop_wor*(1-prop_wor))
ME <- z_crit*SE
CI_wor <- prop_wor + c(-1,1)*ME
CI_wor
## [1] 0.4462776 1.1537224
SE
## [1] 0.1788854
\[ \bar{x}\pm Z_{c} (\frac{s}{\sqrt{n}}*\sqrt{\text{FPC}}) \]
Where : \(\text{FPC} = \frac{N-n}{N-1}\)
xbar <- mean(wor_df$home)
SE <- s/sqrt(n)*sqrt(FPC)
ME <- z_crit*SE
CI_wor <-xbar + c(-1,1)*ME
CI_wor
## [1] 417798 2175870
SE
## [1] 444548.4
N2 <- N^2
SE <- sqrt(N2 * s2/n)*sqrt(FPC)
ME <- z_crit*SE
CI_wr <- Tot_Est_wor + c(-1,1)*ME
CI_wr
## [1] -42125125 47536551
SE
## [1] 22671970
SE <- sqrt((1/n)*prop_wor*(1-prop_wor))*sqrt(FPC)
ME <- z_crit*SE
CI_wor <- prop_wor + c(-1,1)*ME
CI_wor
## [1] 0.4607214 1.1392786
SE
## [1] 0.1715809