library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(mlbench)
library(foreach)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
## The following object is masked from 'package:dplyr':
## 
##     combine
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.3
library(caretEnsemble)
## 
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
## 
##     autoplot
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(sf)
## Warning: package 'sf' was built under R version 4.2.3
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(tidycensus) 
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ purrr   1.0.0
## ✔ tidyr   1.2.1     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::accumulate()       masks foreach::accumulate()
## ✖ caretEnsemble::autoplot() masks ggplot2::autoplot()
## ✖ randomForest::combine()   masks dplyr::combine()
## ✖ dplyr::filter()           masks stats::filter()
## ✖ dplyr::lag()              masks stats::lag()
## ✖ purrr::lift()             masks caret::lift()
## ✖ randomForest::margin()    masks ggplot2::margin()
## ✖ purrr::when()             masks foreach::when()
library(readxl)
library(sampling)
## 
## Attaching package: 'sampling'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
library(dplyr)
library("tigris")
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library("leaflet")
## Warning: package 'leaflet' was built under R version 4.2.3

10.3. A two-stage survey of persons is to be done in which 5% of persons age 35 and under and 15% of persons over 35 will be sampled. Four PSUs will be selected using the composite MOS defined in Sect.10.5. A self-weighting sample is to be selected within each domain, and the workload should be the same in each selected PSU.

Compute the following:

f_data <- c(0.05, 0.15)
m <- 4 
df <- data.frame(PSU = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 
                 Domain_1 = c(80, 60, 50, 80, 50, 90, 50, 50, 55, 50), 
                 Domain_2 = c(20, 20, 90, 10, 25, 20, 80, 65, 25, 50), 
                 Total_Count = c(100, 80, 140, 90, 75, 110, 130, 115, 80, 100))
domain_tot <- c(sum(df$Domain_1), sum(df$Domain_2))

n_data <- f_data * domain_tot
n_data
## [1] 30.75 60.75
30.75+60.75
## [1] 91.5

The expected sample sizes for domains 1 and 2 are 30.75 and 60.75.Therefore, the total expected sample size is 91.5.

  1. Composite MOS for each PSU and the total across PSUs. Verify that the grand total equals the total expected sample size.
MOS <- df$Domain_1*f_data[1]+df$Domain_2*f_data[2]
df$MOS <- MOS
df[c("PSU", "MOS")]
##    PSU   MOS
## 1    1  7.00
## 2    2  6.00
## 3    3 16.00
## 4    4  5.50
## 5    5  6.25
## 6    6  7.50
## 7    7 14.50
## 8    8 12.25
## 9    9  6.50
## 10  10 10.00
sum(MOS)
## [1] 91.5

The total number equals the expected total number of samples, both 91.5.

  1. Selection probability for each PSU.
pi <- round(m*MOS/n_data,3)
df$pi <- pi 
df[c("PSU", "pi")] 
##    PSU    pi
## 1    1 0.911
## 2    2 0.395
## 3    3 2.081
## 4    4 0.362
## 5    5 0.813
## 6    6 0.494
## 7    7 1.886
## 8    8 0.807
## 9    9 0.846
## 10  10 0.658
  1. Domain sampling rate and expected domain sample size within each PSU. Are the expected sample sizes integers? If not, what method can be used for sampling within a PSU that will achieve the desired rate?
n_bar <- 22.875

df$domain_rate_1 <- n_bar * f_data[1] / MOS
df$domain_rate_2 <- n_bar * f_data[2] / MOS

df$expected_size_domain_1 <- df$domain_rate_1 * df$Domain_1
df$expected_size_domain_2 <- df$domain_rate_2 * df$Domain_2

df$expected_size <- df$expected_size_domain_1 + df$expected_size_domain_2

df_selected <- df[c("expected_size_domain_1", "expected_size_domain_2", "expected_size")]

print(df_selected)
##    expected_size_domain_1 expected_size_domain_2 expected_size
## 1               13.071429               9.803571        22.875
## 2               11.437500              11.437500        22.875
## 3                3.574219              19.300781        22.875
## 4               16.636364               6.238636        22.875
## 5                9.150000              13.725000        22.875
## 6               13.725000               9.150000        22.875
## 7                3.943966              18.931034        22.875
## 8                4.668367              18.206633        22.875
## 9                9.677885              13.197115        22.875
## 10               5.718750              17.156250        22.875

Both the PSU-level workload and the expected domain sample size within each PSU are non-integerized. Therefore, the field units must be sampled at a specified rate rather than a fixed number of units based on a rounded sample size.

  1. Verify that the expected sample sizes for any four of the PSUs sum to the total expected sample size you computed in (a).
sums <- df %>%
  sample_n(4) %>%
  summarise(
    Sum_Domain_1 = sum(expected_size_domain_1),
    Sum_Domain_2 = sum(expected_size_domain_2)
  ) %>%
  summarise(Total_Sum = sum(Sum_Domain_1, Sum_Domain_2))

sums
##   Total_Sum
## 1      91.5

Create a map of Census tracts in Washington DC and denote the population counts. Do the same with the proportion of non-Hispanic Blacks in the population. Discuss your observation in relation to the median income demonstrated during the class.

dc_trt <- tracts("District of Columbia", year=2020) 
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |===                                                                   |   5%
  |                                                                            
  |=======                                                               |  10%
  |                                                                            
  |==========                                                            |  14%
  |                                                                            
  |=============                                                         |  19%
  |                                                                            
  |================                                                      |  23%
  |                                                                            
  |===================                                                   |  28%
  |                                                                            
  |=======================================                               |  55%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |=============================================                         |  64%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |===================================================                   |  73%
  |                                                                            
  |=======================================================               |  78%
  |                                                                            
  |==========================================================            |  82%
  |                                                                            
  |=============================================================         |  87%
  |                                                                            
  |================================================================      |  92%
  |                                                                            
  |===================================================================   |  96%
  |                                                                            
  |======================================================================| 100%
dc_trt_trans <- st_transform(dc_trt, 
                             "+proj=longlat +datum=WGS84")
dc_blocks <- block_groups("District of Columbia", year=2020)
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |==                                                                    |   3%
  |                                                                            
  |====                                                                  |   6%
  |                                                                            
  |======                                                                |   9%
  |                                                                            
  |=========                                                             |  12%
  |                                                                            
  |===========                                                           |  15%
  |                                                                            
  |===============                                                       |  21%
  |                                                                            
  |=================                                                     |  24%
  |                                                                            
  |=====================                                                 |  30%
  |                                                                            
  |=======================                                               |  33%
  |                                                                            
  |=========================                                             |  36%
  |                                                                            
  |=============================                                         |  42%
  |                                                                            
  |===============================                                       |  45%
  |                                                                            
  |=================================                                     |  48%
  |                                                                            
  |====================================                                  |  51%
  |                                                                            
  |======================================                                |  54%
  |                                                                            
  |==========================================                            |  60%
  |                                                                            
  |============================================                          |  63%
  |                                                                            
  |==============================================                        |  66%
  |                                                                            
  |================================================                      |  69%
  |                                                                            
  |==================================================                    |  72%
  |                                                                            
  |======================================================                |  78%
  |                                                                            
  |========================================================              |  81%
  |                                                                            
  |==========================================================            |  84%
  |                                                                            
  |===============================================================       |  89%
  |                                                                            
  |=================================================================     |  92%
  |                                                                            
  |===================================================================   |  95%
  |                                                                            
  |===================================================================== |  98%
  |                                                                            
  |======================================================================| 100%
dc_blocks_trans <- st_transform(dc_blocks, "+proj=longlat +datum=WGS84")
api_key <-"fce55754f9cb076c7b0e0b86782d6ba12191c977" 
census_api_key(key=api_key, install=T, overwrite = TRUE) 
## Your original .Renviron will be backed up and stored in your R HOME directory if needed.
## Your API key has been stored in your .Renviron and can be accessed by Sys.getenv("CENSUS_API_KEY"). 
## To use now, restart R or run `readRenviron("~/.Renviron")`
## [1] "fce55754f9cb076c7b0e0b86782d6ba12191c977"
# "install=T" stores your api_key
# If stored, checking api_key as follows:
#    readRenviron("~/.Renviron")
#    Sys.getenv("CENSUS_API_KEY")
DC_BLACK <- get_acs(geography = "tract", 
                    variables = "B01001B_001", # count of Blacks 
                    state = "District of Columbia",
                    geometry = TRUE,
                    summary_var="B01001_001")
## Getting data from the 2017-2021 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
## 
  |                                                                            
  |                                                                      |   0%
  |                                                                            
  |====================================================                  |  75%
  |                                                                            
  |======================================================================| 100%
pal1 <- colorBin(palette = "Greys", 
                 domain = DC_BLACK$summary_est, bins=10)
DC_BLACK %>%
  st_transform(crs = "+init=epsg:4326") %>%
  leaflet(width = "100%") %>%
  addProviderTiles(provider = "CartoDB.Positron") %>%
  addPolygons(popup = ~GEOID,
              color = "black",
              weight = 1)%>%
  addPolygons(popup = ~ str_extract(NAME, "^([^,]*)"),
              stroke = FALSE,
              smoothFactor = 0,
              fillOpacity = 0.7,
              color = ~ pal1(summary_est)) %>%
  addLegend("bottomleft", 
            pal = pal1, 
            values = ~ summary_est,
            title = "Total population - Tract",
            opacity = 1)
## Warning in CPL_crs_from_input(x): GDAL Message 1: +init=epsg:XXXX syntax is
## deprecated. It might return a CRS with a non-EPSG compliant axis order.

B06011_001 is the median income

DC_INCOME <- get_acs(geography = "tract", 
                     variables = "B06011_001",  
                     state = "District of Columbia",
                     geometry = TRUE) 
## Getting data from the 2017-2021 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
palette <- colorBin(palette = "Blues", 
                 domain = DC_INCOME$estimate, bins=10)
DC_INCOME %>%
  st_transform(crs = "+init=epsg:4326") %>% # transform in order to use leaflet function
  leaflet(width = "100%") %>%
  addProviderTiles(provider = "CartoDB.Positron") %>%
  addPolygons(popup = ~GEOID,
              color = "black",
              weight = 1)%>%
  addPolygons(popup = ~ str_extract(NAME, "^([^,]*)"),
              stroke = FALSE,
              smoothFactor = 0,
              fillOpacity = 0.7,
              color = ~ palette(estimate)) %>%
  addLegend("bottomleft", 
            pal = palette, 
            values = ~ estimate,
            title = "Median income - Tract",
            labFormat = labelFormat(prefix = "$"),
            opacity = 1) 
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors
DC_NONHIS_BLACK <- get_acs(geography = "tract", 
                    variables = "B03002_004", 
                    state = "District of Columbia",
                    geometry = TRUE,
                    summary_var="B01001_001")  
## Getting data from the 2017-2021 5-year ACS
## Downloading feature geometry from the Census website.  To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
pal2 <- colorBin(palette = "BuGn", 
                 domain = DC_NONHIS_BLACK$estimate/DC_NONHIS_BLACK$summary_est*100, 
                 bins=10)

DC_NONHIS_BLACK %>%
  st_transform(crs = "+init=epsg:4326") %>%
  leaflet(width = "100%") %>%
  addProviderTiles(provider = "CartoDB.Positron") %>%
  addPolygons(popup = ~GEOID,
              color = "black",
              weight = 1)%>%
  addPolygons(popup = ~ str_extract(NAME, "^([^,]*)"),
              stroke = FALSE,
              smoothFactor = 0,
              fillOpacity = 0.7,
              color = ~ pal2(estimate/summary_est*100)) %>%
  addLegend("bottomleft", 
            pal = pal2, 
            values = ~ estimate/summary_est*100,
            title = "Proportion of Non-Hispanic Blacks - Tract",
            labFormat = labelFormat(suffix="%"),
            opacity = 1)
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette BuGn is 9
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette BuGn is 9
## Returning the palette you asked for with that many colors

## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette BuGn is 9
## Returning the palette you asked for with that many colors

Comparing the two median income graphs, we can see that areas with a higher percentage of non-Hispanic black residents tend to have lower median incomes.