This script creates a list of all different strata possibilities for the PESCA quota sampling project and then visualize the strata with Census information and deciphers which strata to condense.
see “Census Data Wrangling” for wrangling code
# Attach packages
library(tidyverse)
library(dplyr)
library(data.table)
library(kableExtra)
library(knitr)
library(readr)
library(plyr)
library(ggrepel)
library(caret)
library(GGally)
library(treemap)
library(gridExtra)
library(pmdplyr)
We had wanted strata based on:
This developed 380+ strata.
we have since changed quadrants, these quadrants are explored with Census data below
# Put the different strata (5 total) into their own vectors.
age <- c("18_44",
"45_64",
"65_and_above")
gender <- c("male",
"female")
race <- c("white_and_other",
"african_american",
"asian_american",
"hispanic")
education <- c("less_than_high_school_diploma",
"high_school_graduate_or_ged",
"some_college_or_associates_degree",
"bachelors_degree_or_higher")
location <- c("north_coast",
"inland_north",
"la_belt",
"southern_counties")
# Next I want to make a data frame including all of the combinations of the five strata.There should be 384 total combinations
strata_total <- data.frame( expand.grid(age,gender,race,education,location) )
setnames(strata_total,
old = c("Var1","Var2","Var3","Var4","Var5"),
new = c("age","gender","race","education","location"))
#strata_total_table <-
# kable(strata_total,
# caption = "Total Strata Table") %>%
#kable_styling("striped",
# full_width = F)
#strata_total_table
# This saves the total strata to csv format.
write.table(strata_total,
file = "strata__total.csv",
row.names = FALSE,
sep = ",")
#This next frame randomly chooses 25 of the 256 strata combinations and puts them into a table
#strata_sample <- sample_n(strata_total,
# 25,
# replace = FALSE)
#strata_sample_table <-
# kable(strata_sample,
# caption = "Sample Strata Table") %>%
# kable_styling("striped",
#full_width = F)
#strata_sample_table
#save_kable(strata_sample_table,
#file = "sample_strata_table.html")
#[WARNING: this will rewrite the file each time it is run]
# Write table to a csv file for viewing in Excel.
#write.table(strata_sample,
# file = "strata_sample.csv",
# row.names = FALSE,
# sep = ",")
These contain strata with quadrants developed July 19, I kept the original strata below for comparison
July19_strata <- read_csv("July19_strata.csv")
## Parsed with column specification:
## cols(
## race_sex = col_character(),
## age_range = col_character(),
## quadrant = col_character(),
## ID = col_character(),
## population = col_double()
## )
#View(July19_strata)
grouped_strata <- July19_strata %>%
group_by(race_sex, age_range, quadrant, ID) %>%
dplyr::summarise(population = sum(population)) %>%
mutate(percent_pop = population/sum(population)*100)
## `summarise()` regrouping output by 'race_sex', 'age_range', 'quadrant' (override with `.groups` argument)
#View(grouped_strata)
dim(grouped_strata) #96 strata
## [1] 96 6
range(grouped_strata$population) #16,871 is smallest
## [1] 16871 2661522
range(grouped_strata$percent_pop) #0.042% of pop is smallest
## [1] 0.04215751 6.65065109
mean(grouped_strata$population) #416,864.3
## [1] 416864.3
sd(grouped_strata$population) #499,355.5
## [1] 499355.5
##Strata w/ More than 0.1% Percent Pop: 88
morethan_40k <- grouped_strata %>%
filter(percent_pop > "0.1")
#View(morethan_40k)
dim(morethan_40k) #88 strata, removed 6 strata
## [1] 88 6
#########All strata less than 40k
lessthan_40k <- grouped_strata %>%
filter(percent_pop < "0.1")
head(lessthan_40k) #8 strata.
## # A tibble: 6 x 6
## # Groups: race_sex, age_range, quadrant [6]
## race_sex age_range quadrant ID population percent_pop
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 asian_fem… 65-85plus inland_so… |asian_female|.....|65… 35653 0.0891
## 2 asian_male 65-85plus inland_no… |asian_male|.......|65… 38392 0.0959
## 3 asian_male 65-85plus inland_so… |asian_male|.......|65… 27503 0.0687
## 4 black_fem… 65-85plus inland_no… |black_female|.....|65… 21757 0.0544
## 5 black_fem… 65-85plus inland_so… |black_female|.....|65… 28277 0.0707
## 6 black_male 65-85plus inland_no… |black_male|.......|65… 16871 0.0422
######All strata less than 60k, or .15%
lessthan_60k <- grouped_strata %>%
filter(percent_pop < "0.15")
#16 strata:
#View(lessthan_60k)
##All strata less than 80k, or .2%
lessthan_80k <- grouped_strata %>%
filter(percent_pop < "0.2") #strata with less than 0.2% of the population
#View(lessthan_80k)
#23 strata
#All strata less than 2.5%
lessthan_100k <- grouped_strata %>%
filter(percent_pop < "0.25")
#View(lessthan_100k)
dim(lessthan_100k) #32 strata
## [1] 32 6
#### What Do Age Groups Look Like?
##turns out the dplyr pre-ggplot here is unneccessary - wow I'm rusty!!
filter_age <- grouped_strata %>%
group_by(age_range) %>%
dplyr::summarise(population = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
#View(filter_age)
plot2 <- ggplot(filter_age, aes(x = age_range, y=population, fill = age_range)) +
geom_bar(stat ="identity")
plot2
#### What Do Race_Sex Groups Look Like?
filter_race_sex <- grouped_strata %>%
group_by(race_sex) %>%
dplyr::summarise(population = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
plot3 <- ggplot(filter_race_sex, aes(x = race_sex, y=population, fill = race_sex)) +
geom_bar(stat ="identity")
plot3
### Quadrants?
filter_quadrant <- grouped_strata %>%
group_by(quadrant) %>%
dplyr::summarise(population = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
plot4 <- ggplot(filter_quadrant, aes(x = quadrant, y=population, fill = quadrant)) +
geom_bar(stat ="identity")
plot4
#### Spread of Percent Population by Race & Sex
plot1 <- ggplot(grouped_strata, aes(x = percent_pop, fill = race_sex)) +
geom_histogram(alpha=0.5, position="identity")
plot1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##Spread of Population by Race & Sex By Quadrant
############# Inland North ###############
filter_quadrant <- grouped_strata %>%
filter(quadrant == "inland_north") %>%
group_by(race_sex) %>%
dplyr::summarise(population = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
plot5 <- ggplot(filter_quadrant, aes(x = race_sex, y=population, fill = race_sex)) +
geom_bar(stat ="identity") +
ggtitle("Inland North")
plot5
############# Inland South ###############
filter_quadrantIS <- grouped_strata %>%
filter(quadrant == "inland_south") %>%
group_by(race_sex) %>%
dplyr::summarise(population = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
plot6 <- ggplot(filter_quadrantIS, aes(x = race_sex, y=population, fill = race_sex)) +
geom_bar(stat ="identity") +
ggtitle("Inland South")
plot6
############# South Coast ###############
filter_quadrantSC <- grouped_strata %>%
filter(quadrant == "south_coast") %>%
group_by(race_sex) %>%
dplyr::summarise(population = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
plot7 <- ggplot(filter_quadrantSC, aes(x = race_sex, y=population, fill = race_sex)) +
geom_bar(stat ="identity") +
ggtitle("South Coast")
plot7
############# South Coast ###############
filter_quadrantNC <- grouped_strata %>%
filter(quadrant == "north_coast") %>%
group_by(race_sex) %>%
dplyr::summarise(population = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
plot8 <- ggplot(filter_quadrantNC, aes(x = race_sex, y=population, fill = race_sex)) +
geom_bar(stat ="identity") +
ggtitle("North Coast")
plot8
70 Strata Total
39 strata being condensed
15 new strata
Will manually change strata because it is faster, for code that begin to merge, see below
July19_strata_condensed <- read_csv("July19_strata_condensed.csv")
## Parsed with column specification:
## cols(
## race_sex = col_character(),
## age_range = col_character(),
## quadrant = col_character(),
## ID = col_character(),
## population = col_double()
## )
final_cond_strata <- July19_strata_condensed%>%
mutate(ID = id_variable(race_sex, age_range, quadrant, .method = "character"))
#View(final_cond_strata)
#write.csv(final_cond_strata, file.path(path, "final_cond_strata.csv"), row.names=FALSE)
#Starta in general
mean(final_cond_strata$population) # 571,699.6 - significantly larger mean than for pre-condensed
## [1] 571699.6
min(final_cond_strata$population) # 109,622
## [1] 109622
max(final_cond_strata$population) #661,522
## [1] 2661522
sd(final_cond_strata$population) #511,083.3
## [1] 511083.3
##Exploratory stats for condensed
cond_vs_not <- final_cond_strata %>%
filter(age_range == c("20_85plus", "35-85plus"))
mean(cond_vs_not$population) #20-85plus: 183890.9 w/ 35-85 plus: 204577.5,
## [1] 204577.5
sd(cond_vs_not$population) #65536.05
## [1] 65536.05
cond_vs_not1 <- final_cond_strata %>%
filter(age_range == c("20_85plus"))
mean(cond_vs_not1$population)
## [1] 183890.9
sd(cond_vs_not1$population) #49176.26
## [1] 49176.26
cond_vs_not2 <- final_cond_strata %>%
filter(age_range == "35-85plus")
mean(cond_vs_not2$population) #274540.5 - still a lot smaller than mean
## [1] 274540.5
sd(cond_vs_not2$population) #81473.55
## [1] 81473.55
###NON-condensed
cond_vs_not3 <- final_cond_strata %>%
filter(age_range == c("20_34", "35-64", "68-85plus"))
## Warning in age_range == c("20_34", "35-64", "68-85plus"): longer object length
## is not a multiple of shorter object length
mean(cond_vs_not3$population) #626,643.9
## [1] 626643.9
sd(cond_vs_not3$population) #386,014.9
## [1] 386014.9
final_strata <- read_csv("final_strata.csv")
## Parsed with column specification:
## cols(
## ID = col_character(),
## age_range = col_character(),
## race_sex = col_character(),
## quadrant = col_character(),
## population = col_double()
## )
#View(final_strata)
#This has unique ID, age range, race_sex, quadrant, population size
dim(final_strata) #96 rows, 5 columns
## [1] 96 5
sum(final_strata$population) #40,018,969
## [1] 40018969
strata_relative_size <- final_strata %>%
mutate(percent_pop = population/sum(population)*100)
head(strata_relative_size)
## # A tibble: 6 x 6
## ID age_range race_sex quadrant population percent_pop
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 20_AF_IN 20_34 asian_female inland_north 94146 0.235
## 2 20_AF_LB 20_34 asian_female la_belt 186337 0.466
## 3 20_AF_NC 20_34 asian_female north_coast 249132 0.623
## 4 20_AF_SC 20_34 asian_female southern_counties 155792 0.389
## 5 20_AM_IN 20_34 asian_male inland_north 92458 0.231
## 6 20_AM_LB 20_34 asian_male la_belt 183186 0.458
range(strata_relative_size$population)
## [1] 21408 1875201
#View(strata_relative_size)
#path <- "/Users/phoeberacine/seafoodmarkets/strata_creation_code"
#write.csv(strata_relative_size, file.path(path, "total_strata_size.csv"), row.names=FALSE)
6 strata: less than 0.1% or 40,000
12 strata: less than 0.1% or 60,000, this adds:
15 strata: less than 0.2% or 80,000, this adds:
27 strata: less than 0.25% or 100,000, this adds: * Asian women: 20-24, 65+ Inland North * Hispanic male: 65+ North coast, Inland North * Black women: 20-34 Southern Counties, 35-64 Inland North, 65+ LA Belt * Black men: 35-64 North coast, 35-64 Inland North, 20-34 Southern Counties * Asian men: 65+ southern counties, 20-24 Inland North * Hispanic women: 65+ Inland North, North Coast
##Removing rows with less than 0.1% or 40,000 should we need it
morethan_40k_OG <- strata_relative_size %>%
filter(percent_pop > "0.1")
#View(lessthan_40k)
dim(morethan_40k_OG) #90 strata, removed 6 strata
## [1] 90 6
##All strata less than 40k
lessthan_40k_OG <- strata_relative_size %>%
filter(percent_pop < "0.1")
head(lessthan_40k_OG) #all 6 small strata. These strata are all Black above the age of 65. There is an even spread of Black men and women in north coast and inland north. Then we would remove black women in southern counties, and black men in the north coast.
## # A tibble: 6 x 6
## ID age_range race_sex quadrant population percent_pop
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 65_BF_IN 65-85plus black_female inland_north 27996 0.0700
## 2 65_BF_NC 65-85plus black_female north_coast 37289 0.0932
## 3 65_BF_SC 65-85plus black_female southern_counties 38393 0.0959
## 4 65_BM_IN 65-85plus black_male inland_north 21408 0.0535
## 5 65_BM_NC 65-85plus black_male north_coast 28262 0.0706
## 6 65_BM_SC 65-85plus black_male southern_counties 30711 0.0767
##All strata less than 60k, or .15%
lessthan_60k_OG <- strata_relative_size %>%
filter(percent_pop < "0.15")
#12 strata, the new 6 are: Black men and women aged 25-34 in the Inland North and North Coast, Asian men in Inland north, and Black Men above age 65 in the LA Belt
#View(lessthan_60k)
##All strata less than 80k, or .2%
lessthan_80k_OG <- strata_relative_size %>%
filter(percent_pop < "0.2") #strata with less than 0.2% of the population
#View(lessthan_80k)
#All strata less than 2.5%
lessthan_100k_OG <- strata_relative_size %>%
filter(percent_pop < "0.25")
#View(lessthan_100k)
dim(lessthan_100k_OG) #27 strata
## [1] 27 6
#Need to merge 27
Black Men * merge 4 age: 65+, race_sex: Black men, geographies:all * merge 3: 20-34, race_sex: Black men, geographies: all but LA Belt * merge 2: 35-64, race_sex: Black men, geographies: Inland North & North Coast
Black Women * merge 4: age: 65+, race_sex: Black women, geographies: all * merge 2: age 35-64, race_sex: Black women, geographies: inland north, north coast * merge 3: age 20-34, race_sex: Black women, geographies: inland north, north coast, southern counties
Hispanic Women * merge 2: 65+ Hispanic women: north coast & inland north
Hispanic Men * merge 2: 65+ Hispanic men: north coast & inland north
Asian folks above 65+ * merge 2: 65+ Asian men & women, Inland North
Asian folks 20-34 * merge 2: 20-34 Asian men & women, Inland North
Asian men * merge 2: 34-64, 65+ Asian men: Southern Counties
#Need to merge 27
Black Men * merge 4 age: 65+, race_sex: Black men, geographies:all * merge 3: 20-34, race_sex: Black men, geographies: all but LA Belt * merge 2: 35-64, race_sex: Black men, geographies: Inland North & North Coast
Black Women * merge 4: age: 65+, race_sex: Black women, geographies: all * merge 2: age 35-64, race_sex: Black women, geographies: inland north, north coast * merge 3: age 20-34, race_sex: Black women, geographies: inland north, north coast, southern counties
Hispanic Women * merge 2: 65+ Hispanic women: north coast & inland north
Hispanic Men * merge 2: 65+ Hispanic men: north coast & inland north
Asian folks above 65+ - change this, merge geographies if I think that’s what matters * merge 2: 65+ Asian men & women, Inland North, Southern Counties * merge 2: 65+ Asian men: men & women, Southern Counties
Asian folks 34-64 * merge 2: 34-64, men and women, Southern Counties
Asian folks 20-34 * merge 2: 20-34 Asian men & women, Inland North
Guidance: - https://markhneedham.com/blog/2015/06/27/r-dplyr-squashing-multiple-rows-per-group-into-one/ - https://stackoverflow.com/questions/39092110/r-dplyr-summarize-and-retain-other-columns
####Code: * first we’re condensing rows in the unique ways they need to be condensed using summarise * next, we’re bringing new rows together
#Need to merge 27, 29 total
####Black Men
#merge 4 age: 65+, race_sex: Black men, geographies: all
#this will filter out the strata of interest, then using summarise we can create new column values
BM_65_ALL <- strata_relative_size %>%
filter(age_range == "65-85plus", race_sex == "black_male") %>% #we need to filter since we need population to sum between particular rows
group_by(age_range, race_sex) %>% #group_by pretty much always goes before summarise
summarise(ID = first("65_BM_ALL"), age_range = first("65-85plus"), race_sex = "black_male",
quadrant = paste(quadrant, collapse = ", "), population = sum(population)) #first() allows you to create a new row value, you can also put a column name in there if you want to same values
#View(BM_65_ALL)
##merge 3: 20-34, race_sex: Black men, geographies: all but LA Belt
BM_20_IN_NC_SC <- strata_relative_size %>%
filter(age_range == "20_34", race_sex == "black_male",
quadrant %in% c("inland_north", "north_coast", "southern_counties")) %>%
group_by(age_range, race_sex) %>%
summarise(ID = first("20_BM_IN_NC_SC"), age_range = first("20_34"), race_sex = "black_male",
quadrant = paste(quadrant, collapse = ", "), population = sum(population))
#View(BM_20_IN_NC_SC)
#merge 2: 35-64, race_sex: Black men, geographies: Inland North & North Coast
BM_35_IN_NC <- strata_relative_size %>%
filter(age_range == "35-64", race_sex == "black_male",quadrant %in% c("inland_north", "north_coast")) %>%
group_by(age_range, race_sex) %>%
summarise(ID = first("20_BM_IN_NC"), age_range = first("35_64"), race_sex = "black_male",
quadrant = paste(quadrant, collapse = ", "), population = sum(population))
#View(BM_35_IN_NC)
#Black Women
##merge 4: age: 65+, race_sex: Black women, geographies: all
##merge 2: age 35-64, race_sex: Black women, geographies: inland north, north coast
##merge 3: age 20-34, race_sex: Black women, geographies: inland north, north coast, southern counties
#Hispanic Women
##merge 2: 65+ Hispanic women: north coast & inland north
#Hispanic Men
##merge 2: 65+ Hispanic men: north coast & inland north
#Asian folks above 65+
## merge 2: 65+ Asian men & women, Inland North
## merge 2: 65+ Asian men: men & women, Southern Counties
#Asian folks 34-64
## merge 2: 34-64, men and women, Southern Counties
#Asian folks 20-34
# merge 2: 20-34 Asian men & women, Inland North
condensed_strata <- read_csv("condensed_strata.csv") %>%
mutate(percent_pop = population/sum(population)*100)
## Parsed with column specification:
## cols(
## ID = col_character(),
## age_range = col_character(),
## race_sex = col_character(),
## quadrant = col_character(),
## population = col_double()
## )
head(condensed_strata)
## # A tibble: 6 x 6
## ID age_range race_sex quadrant population percent_pop
## <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 20_AM_AF_IN 20_34 asian_male_fema… inland_north 186604 0.465
## 2 20_AF_LB 20_34 asian_female la_belt 186337 0.465
## 3 20_AF_NC 20_34 asian_female north_coast 249132 0.621
## 4 20_AF_SC 20_34 asian_female southern_counti… 155792 0.389
## 5 20_AM_LB 20_34 asian_male la_belt 183186 0.457
## 6 20_AM_NC 20_34 asian_male north_coast 254296 0.634
ggplot(strata_relative_size, aes(population)) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
ggplot(strata_relative_size, aes(x = percent_pop, fill = race_sex)) +
geom_histogram(alpha=0.5, position="identity")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 <- ggplot(strata_relative_size, aes(x = population, color =race_sex)) +
geom_freqpoly() +
scale_x_log10(breaks = c(50, 150, 400, 750) * 1000)
p2 <- ggplot(strata_relative_size, aes(x = population, color =race_sex, fill = race_sex)) +
geom_density(alpha = .15) +
scale_x_log10(breaks = c(50, 150, 400, 750) * 1000)
gridExtra::grid.arrange(p1, p2, nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##Boxplot by California geography
ggplot(data = strata_relative_size, mapping = aes(x = quadrant, y = population)) +
geom_boxplot(alpha = 0) +
geom_jitter(alpha = 0.3, aes(color = race_sex))
#Boxplot by California age ranges
ggplot(data = strata_relative_size, mapping = aes(x = age_range, y = population)) +
geom_boxplot(alpha = 0) +
geom_jitter(alpha = 0.3, aes(color = race_sex))
NOTE: Why are "Whiteother Male and White Other Female showing up multiple times?? * oh!!! age group!
la_belt_plot1 <- strata_relative_size %>%
select(-age_range, -ID) %>%
filter(quadrant == "la_belt") %>%
ggplot(mapping = aes(x = quadrant, y = population)) +
geom_boxplot(alpha = 0) +
geom_jitter(alpha = 0.3, aes(color = race_sex)) +
#geom_text(aes(label=race_sex),hjust=0, vjust=0) +
geom_label_repel(aes(label = race_sex),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50') +
theme_classic()
la_belt_plot1
north_coast_plot1 <- strata_relative_size %>%
filter(quadrant == "north_coast") %>%
ggplot(mapping = aes(x = quadrant, y = population)) +
geom_boxplot(alpha = 0) +
geom_jitter(alpha = 0.3, aes(color = race_sex)) +
#geom_text(aes(label=race_sex),hjust=0, vjust=0) +
geom_label_repel(aes(label = race_sex),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50') +
theme_classic()
north_coast_plot1
southern_counties_plot1 <- strata_relative_size %>%
filter(quadrant == "southern_counties") %>%
ggplot(mapping = aes(x = quadrant, y = population)) +
geom_boxplot(alpha = 0) +
geom_jitter(alpha = 0.3, aes(color = race_sex)) +
#geom_text(aes(label=race_sex),hjust=0, vjust=0) +
geom_label_repel(aes(label = race_sex),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50') +
theme_classic()
southern_counties_plot1
inland_north_plot1 <- strata_relative_size %>%
filter(quadrant == "inland_north") %>%
ggplot(mapping = aes(x = quadrant, y = population)) +
geom_boxplot(alpha = 0) +
geom_jitter(alpha = 0.3, aes(color = race_sex)) +
#geom_text(aes(label=race_sex),hjust=0, vjust=0) +
geom_label_repel(aes(label = race_sex),
box.padding = 0.35,
point.padding = 0.5,
segment.color = 'grey50') +
theme_classic()
inland_north_plot1