library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(mlbench)
library(foreach)
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
library(rpart)
## Warning: package 'rpart' was built under R version 4.2.3
library(caretEnsemble)
##
## Attaching package: 'caretEnsemble'
## The following object is masked from 'package:ggplot2':
##
## autoplot
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(sf)
## Warning: package 'sf' was built under R version 4.2.3
## Linking to GEOS 3.11.0, GDAL 3.5.3, PROJ 9.1.0; sf_use_s2() is TRUE
library(tidycensus)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ purrr 1.0.0
## ✔ tidyr 1.2.1 ✔ stringr 1.5.0
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::accumulate() masks foreach::accumulate()
## ✖ caretEnsemble::autoplot() masks ggplot2::autoplot()
## ✖ randomForest::combine() masks dplyr::combine()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::lift() masks caret::lift()
## ✖ randomForest::margin() masks ggplot2::margin()
## ✖ purrr::when() masks foreach::when()
library(readxl)
library(sampling)
##
## Attaching package: 'sampling'
##
## The following object is masked from 'package:caret':
##
## cluster
library(dplyr)
library("tigris")
## To enable caching of data, set `options(tigris_use_cache = TRUE)`
## in your R script or .Rprofile.
library("leaflet")
## Warning: package 'leaflet' was built under R version 4.2.3
f_data <- c(0.05, 0.15)
m <- 4
df <- data.frame(PSU = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10),
Domain_1 = c(80, 60, 50, 80, 50, 90, 50, 50, 55, 50),
Domain_2 = c(20, 20, 90, 10, 25, 20, 80, 65, 25, 50),
Total_Count = c(100, 80, 140, 90, 75, 110, 130, 115, 80, 100))
domain_tot <- c(sum(df$Domain_1), sum(df$Domain_2))
n_data <- f_data * domain_tot
n_data
## [1] 30.75 60.75
30.75+60.75
## [1] 91.5
The expected sample sizes for domains 1 and 2 are 30.75 and 60.75.Therefore, the total expected sample size is 91.5.
MOS <- df$Domain_1*f_data[1]+df$Domain_2*f_data[2]
df$MOS <- MOS
df[c("PSU", "MOS")]
## PSU MOS
## 1 1 7.00
## 2 2 6.00
## 3 3 16.00
## 4 4 5.50
## 5 5 6.25
## 6 6 7.50
## 7 7 14.50
## 8 8 12.25
## 9 9 6.50
## 10 10 10.00
sum(MOS)
## [1] 91.5
The total number equals the expected total number of samples, both 91.5.
pi <- round(m*MOS/n_data,3)
df$pi <- pi
df[c("PSU", "pi")]
## PSU pi
## 1 1 0.911
## 2 2 0.395
## 3 3 2.081
## 4 4 0.362
## 5 5 0.813
## 6 6 0.494
## 7 7 1.886
## 8 8 0.807
## 9 9 0.846
## 10 10 0.658
n_bar <- 22.875
df$domain_rate_1 <- n_bar * f_data[1] / MOS
df$domain_rate_2 <- n_bar * f_data[2] / MOS
df$expected_size_domain_1 <- df$domain_rate_1 * df$Domain_1
df$expected_size_domain_2 <- df$domain_rate_2 * df$Domain_2
df$expected_size <- df$expected_size_domain_1 + df$expected_size_domain_2
df_selected <- df[c("expected_size_domain_1", "expected_size_domain_2", "expected_size")]
print(df_selected)
## expected_size_domain_1 expected_size_domain_2 expected_size
## 1 13.071429 9.803571 22.875
## 2 11.437500 11.437500 22.875
## 3 3.574219 19.300781 22.875
## 4 16.636364 6.238636 22.875
## 5 9.150000 13.725000 22.875
## 6 13.725000 9.150000 22.875
## 7 3.943966 18.931034 22.875
## 8 4.668367 18.206633 22.875
## 9 9.677885 13.197115 22.875
## 10 5.718750 17.156250 22.875
Both the PSU-level workload and the expected domain sample size within each PSU are non-integerized. Therefore, the field units must be sampled at a specified rate rather than a fixed number of units based on a rounded sample size.
sums <- df %>%
sample_n(4) %>%
summarise(
Sum_Domain_1 = sum(expected_size_domain_1),
Sum_Domain_2 = sum(expected_size_domain_2)
) %>%
summarise(Total_Sum = sum(Sum_Domain_1, Sum_Domain_2))
sums
## Total_Sum
## 1 91.5
dc_trt <- tracts("District of Columbia", year=2020)
##
|
| | 0%
|
|=== | 5%
|
|======= | 10%
|
|========== | 14%
|
|============= | 19%
|
|================ | 23%
|
|=================== | 28%
|
|======================================= | 55%
|
|========================================== | 60%
|
|============================================= | 64%
|
|================================================ | 69%
|
|=================================================== | 73%
|
|======================================================= | 78%
|
|========================================================== | 82%
|
|============================================================= | 87%
|
|================================================================ | 92%
|
|=================================================================== | 96%
|
|======================================================================| 100%
dc_trt_trans <- st_transform(dc_trt,
"+proj=longlat +datum=WGS84")
dc_blocks <- block_groups("District of Columbia", year=2020)
##
|
| | 0%
|
|== | 3%
|
|==== | 6%
|
|====== | 9%
|
|========= | 12%
|
|=========== | 15%
|
|=============== | 21%
|
|================= | 24%
|
|===================== | 30%
|
|======================= | 33%
|
|========================= | 36%
|
|============================= | 42%
|
|=============================== | 45%
|
|================================= | 48%
|
|==================================== | 51%
|
|====================================== | 54%
|
|========================================== | 60%
|
|============================================ | 63%
|
|============================================== | 66%
|
|================================================ | 69%
|
|================================================== | 72%
|
|====================================================== | 78%
|
|======================================================== | 81%
|
|========================================================== | 84%
|
|=============================================================== | 89%
|
|================================================================= | 92%
|
|=================================================================== | 95%
|
|===================================================================== | 98%
|
|======================================================================| 100%
dc_blocks_trans <- st_transform(dc_blocks, "+proj=longlat +datum=WGS84")
api_key <-"fce55754f9cb076c7b0e0b86782d6ba12191c977"
census_api_key(key=api_key, install=T, overwrite = TRUE)
## Your original .Renviron will be backed up and stored in your R HOME directory if needed.
## Your API key has been stored in your .Renviron and can be accessed by Sys.getenv("CENSUS_API_KEY").
## To use now, restart R or run `readRenviron("~/.Renviron")`
## [1] "fce55754f9cb076c7b0e0b86782d6ba12191c977"
# "install=T" stores your api_key
# If stored, checking api_key as follows:
# readRenviron("~/.Renviron")
# Sys.getenv("CENSUS_API_KEY")
DC_BLACK <- get_acs(geography = "tract",
variables = "B01001B_001", # count of Blacks
state = "District of Columbia",
geometry = TRUE,
summary_var="B01001_001")
## Getting data from the 2017-2021 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
##
|
| | 0%
|
|==================================================== | 75%
|
|======================================================================| 100%
pal1 <- colorBin(palette = "Greys",
domain = DC_BLACK$summary_est, bins=10)
DC_BLACK %>%
st_transform(crs = "+init=epsg:4326") %>%
leaflet(width = "100%") %>%
addProviderTiles(provider = "CartoDB.Positron") %>%
addPolygons(popup = ~GEOID,
color = "black",
weight = 1)%>%
addPolygons(popup = ~ str_extract(NAME, "^([^,]*)"),
stroke = FALSE,
smoothFactor = 0,
fillOpacity = 0.7,
color = ~ pal1(summary_est)) %>%
addLegend("bottomleft",
pal = pal1,
values = ~ summary_est,
title = "Total population - Tract",
opacity = 1)
## Warning in CPL_crs_from_input(x): GDAL Message 1: +init=epsg:XXXX syntax is
## deprecated. It might return a CRS with a non-EPSG compliant axis order.
B06011_001 is the median income
DC_INCOME <- get_acs(geography = "tract",
variables = "B06011_001",
state = "District of Columbia",
geometry = TRUE)
## Getting data from the 2017-2021 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
palette <- colorBin(palette = "Blues",
domain = DC_INCOME$estimate, bins=10)
DC_INCOME %>%
st_transform(crs = "+init=epsg:4326") %>% # transform in order to use leaflet function
leaflet(width = "100%") %>%
addProviderTiles(provider = "CartoDB.Positron") %>%
addPolygons(popup = ~GEOID,
color = "black",
weight = 1)%>%
addPolygons(popup = ~ str_extract(NAME, "^([^,]*)"),
stroke = FALSE,
smoothFactor = 0,
fillOpacity = 0.7,
color = ~ palette(estimate)) %>%
addLegend("bottomleft",
pal = palette,
values = ~ estimate,
title = "Median income - Tract",
labFormat = labelFormat(prefix = "$"),
opacity = 1)
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette Blues is 9
## Returning the palette you asked for with that many colors
DC_NONHIS_BLACK <- get_acs(geography = "tract",
variables = "B03002_004",
state = "District of Columbia",
geometry = TRUE,
summary_var="B01001_001")
## Getting data from the 2017-2021 5-year ACS
## Downloading feature geometry from the Census website. To cache shapefiles for use in future sessions, set `options(tigris_use_cache = TRUE)`.
pal2 <- colorBin(palette = "BuGn",
domain = DC_NONHIS_BLACK$estimate/DC_NONHIS_BLACK$summary_est*100,
bins=10)
DC_NONHIS_BLACK %>%
st_transform(crs = "+init=epsg:4326") %>%
leaflet(width = "100%") %>%
addProviderTiles(provider = "CartoDB.Positron") %>%
addPolygons(popup = ~GEOID,
color = "black",
weight = 1)%>%
addPolygons(popup = ~ str_extract(NAME, "^([^,]*)"),
stroke = FALSE,
smoothFactor = 0,
fillOpacity = 0.7,
color = ~ pal2(estimate/summary_est*100)) %>%
addLegend("bottomleft",
pal = pal2,
values = ~ estimate/summary_est*100,
title = "Proportion of Non-Hispanic Blacks - Tract",
labFormat = labelFormat(suffix="%"),
opacity = 1)
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette BuGn is 9
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette BuGn is 9
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(max(3, n), palette): n too large, allowed maximum for palette BuGn is 9
## Returning the palette you asked for with that many colors
Comparing the two median income graphs, we can see that areas with a higher percentage of non-Hispanic black residents tend to have lower median incomes.