Script Overview

This script creates a list of all different strata possibilities for the PESCA quota sampling project and then visualize the strata with Census information and deciphers which strata to condense.

see “Census Data Wrangling” for wrangling code

Note about document:

  • as of July 17: we dropped education due to difficult getting cross section of ACS 5 Yr
  • one problem with this is we have to start at age 20 because of the ways things were grouped
  • changes not reflected in beginning chunks, but reflected in strata visualization below
  • As of July 19: we decided to create our own quadrants according to coastal vs inland
# Attach packages
library(tidyverse)
library(dplyr)
library(data.table)
library(kableExtra)
library(knitr)
library(readr)
library(plyr)
library(ggrepel)
library(caret)
library(GGally)
library(treemap)
library(gridExtra)
library(pmdplyr)

Strata Goals

We had wanted strata based on:

This developed 380+ strata.

Initial Strata Exploration

we have since changed quadrants, these quadrants are explored with Census data below

# Put the different strata (5 total) into their own vectors.

age <- c("18_44",
         "45_64",
         "65_and_above")
gender <- c("male",
            "female")
race <- c("white_and_other",
          "african_american",  
          "asian_american",
          "hispanic")
education <- c("less_than_high_school_diploma",
               "high_school_graduate_or_ged",
               "some_college_or_associates_degree",
               "bachelors_degree_or_higher")
location <- c("north_coast",
              "inland_north",
              "la_belt",
              "southern_counties")


# Next I want to make a data frame including all of the combinations of the five strata.There should be 384 total combinations

strata_total <- data.frame( expand.grid(age,gender,race,education,location) )

setnames(strata_total,
         old = c("Var1","Var2","Var3","Var4","Var5"),
         new = c("age","gender","race","education","location"))

#strata_total_table <- 
 # kable(strata_total,
  #      caption = "Total Strata Table") %>% 
  #kable_styling("striped",
   #             full_width = F)


#strata_total_table
# This saves the total strata  to csv format.

write.table(strata_total,
           file = "strata__total.csv",
           row.names = FALSE,
           sep = ",")
#This next frame randomly chooses 25 of the 256 strata combinations and puts them into a table


#strata_sample <- sample_n(strata_total,
                      #  25,
                       # replace = FALSE)

#strata_sample_table <- 
 # kable(strata_sample,
   #     caption = "Sample Strata Table") %>% 
 # kable_styling("striped",
                #full_width = F)


#strata_sample_table
#save_kable(strata_sample_table,
           #file = "sample_strata_table.html")

#[WARNING: this will rewrite the file each time it is run]
# Write table to a csv file for viewing in Excel.  

#write.table(strata_sample,
       #    file = "strata_sample.csv",
        #   row.names = FALSE,
         #  sep = ",")

Adding Census Data

These contain strata with quadrants developed July 19, I kept the original strata below for comparison

July19_strata <- read_csv("July19_strata.csv")
## Parsed with column specification:
## cols(
##   race_sex = col_character(),
##   age_range = col_character(),
##   quadrant = col_character(),
##   ID = col_character(),
##   population = col_double()
## )
#View(July19_strata)

grouped_strata <- July19_strata %>% 
  group_by(race_sex, age_range, quadrant, ID) %>% 
  dplyr::summarise(population = sum(population)) %>% 
    mutate(percent_pop = population/sum(population)*100)
## `summarise()` regrouping output by 'race_sex', 'age_range', 'quadrant' (override with `.groups` argument)
#View(grouped_strata)

Exploring Size of Quadrants

dim(grouped_strata) #96 strata
## [1] 96  6
range(grouped_strata$population) #16,871 is smallest
## [1]   16871 2661522
range(grouped_strata$percent_pop) #0.042% of pop is smallest
## [1] 0.04215751 6.65065109
mean(grouped_strata$population) #416,864.3
## [1] 416864.3
sd(grouped_strata$population) #499,355.5
## [1] 499355.5
##Strata w/ More than 0.1% Percent Pop: 88
morethan_40k <- grouped_strata %>% 
  filter(percent_pop > "0.1") 

#View(morethan_40k)
dim(morethan_40k) #88 strata, removed 6 strata
## [1] 88  6
#########All strata less than 40k
lessthan_40k <- grouped_strata %>% 
  filter(percent_pop < "0.1") 

head(lessthan_40k) #8 strata.
## # A tibble: 6 x 6
## # Groups:   race_sex, age_range, quadrant [6]
##   race_sex   age_range quadrant   ID                      population percent_pop
##   <chr>      <chr>     <chr>      <chr>                        <dbl>       <dbl>
## 1 asian_fem… 65-85plus inland_so… |asian_female|.....|65…      35653      0.0891
## 2 asian_male 65-85plus inland_no… |asian_male|.......|65…      38392      0.0959
## 3 asian_male 65-85plus inland_so… |asian_male|.......|65…      27503      0.0687
## 4 black_fem… 65-85plus inland_no… |black_female|.....|65…      21757      0.0544
## 5 black_fem… 65-85plus inland_so… |black_female|.....|65…      28277      0.0707
## 6 black_male 65-85plus inland_no… |black_male|.......|65…      16871      0.0422
######All strata less than 60k, or .15%
lessthan_60k <- grouped_strata %>% 
  filter(percent_pop < "0.15") 

#16 strata:

#View(lessthan_60k)

##All strata less than 80k, or .2%
lessthan_80k <- grouped_strata %>% 
  filter(percent_pop < "0.2") #strata with less than 0.2% of the population

#View(lessthan_80k) 

#23 strata

#All strata less than 2.5%
lessthan_100k <- grouped_strata %>% 
  filter(percent_pop < "0.25")
#View(lessthan_100k)
dim(lessthan_100k) #32 strata
## [1] 32  6

Strata Exploration

#### What Do Age Groups Look Like?

##turns out the dplyr pre-ggplot here is unneccessary - wow I'm rusty!!
filter_age <- grouped_strata %>% 
  group_by(age_range) %>% 
  dplyr::summarise(population = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
#View(filter_age)  

plot2 <- ggplot(filter_age, aes(x = age_range, y=population, fill = age_range)) +
  geom_bar(stat ="identity") 
plot2

#### What Do Race_Sex Groups Look Like?
filter_race_sex <- grouped_strata %>% 
  group_by(race_sex) %>% 
  dplyr::summarise(population = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
plot3 <- ggplot(filter_race_sex, aes(x = race_sex, y=population, fill = race_sex)) +
  geom_bar(stat ="identity") 
plot3

### Quadrants?
filter_quadrant <- grouped_strata %>% 
  group_by(quadrant) %>% 
  dplyr::summarise(population = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
plot4 <- ggplot(filter_quadrant, aes(x = quadrant, y=population, fill = quadrant)) +
  geom_bar(stat ="identity") 
plot4

#### Spread of Percent Population by Race & Sex
plot1 <- ggplot(grouped_strata, aes(x = percent_pop, fill = race_sex)) +
  geom_histogram(alpha=0.5, position="identity") 

plot1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Population By Quandrant & Race_sex

  • Inland North has populations under 270,000 for: Asian women and men, Black women and men. 1.5 million for Hispanic women and men
  • Inland South has populations under 250,000 for: Asian women and men, Black women and men
  • South Coast has populations under 500,000 for: Black women and men
  • North Coast has populations under 250,000 for: Black women and men
##Spread of Population by Race & Sex By Quadrant

############# Inland North ###############
filter_quadrant <- grouped_strata %>% 
  filter(quadrant == "inland_north") %>% 
  group_by(race_sex) %>% 
  dplyr::summarise(population = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
plot5 <- ggplot(filter_quadrant, aes(x = race_sex, y=population, fill = race_sex)) +
  geom_bar(stat ="identity") +
  ggtitle("Inland North")
plot5

############# Inland South ###############
filter_quadrantIS <- grouped_strata %>% 
  filter(quadrant == "inland_south") %>% 
  group_by(race_sex) %>% 
  dplyr::summarise(population = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
plot6 <- ggplot(filter_quadrantIS, aes(x = race_sex, y=population, fill = race_sex)) +
  geom_bar(stat ="identity") +
  ggtitle("Inland South")
plot6

############# South Coast ###############
filter_quadrantSC <- grouped_strata %>% 
  filter(quadrant == "south_coast") %>% 
  group_by(race_sex) %>% 
  dplyr::summarise(population = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
plot7 <- ggplot(filter_quadrantSC, aes(x = race_sex, y=population, fill = race_sex)) +
  geom_bar(stat ="identity") +
  ggtitle("South Coast")
plot7

############# South Coast ###############
filter_quadrantNC <- grouped_strata %>% 
  filter(quadrant == "north_coast") %>% 
  group_by(race_sex) %>% 
  dplyr::summarise(population = sum(population))
## `summarise()` ungrouping output (override with `.groups` argument)
plot8 <- ggplot(filter_quadrantNC, aes(x = race_sex, y=population, fill = race_sex)) +
  geom_bar(stat ="identity") +
  ggtitle("North Coast")
plot8

Condensing Strata

  • Condensing populations below 100,000 or 0.25%
  • Condensing by age within areas
  • A few strata larger than 100,000 were merged in order to make age group around them large enough
    • these include: BF_NC_34_65, BM_NC_34_65, HF_NC_35-64, HM_NC_35-64, HF_IN_35-64, HM_NC_35-64, HM_IN_35-64, AF_35_65_IS, AM_35_65_IS

70 Strata Total

39 strata being condensed

15 new strata

  • ADDED:Merge 3: 20-65+ Black Men: Inland North
  • ADDED:Merge 3: 20-65+ Black Men: Inland South
  • ADDED:Merge 2: 34-65+ Black Men: South Coast
  • ADDED:Merge 3: 20-65+ Black Men: North Coast
  • ADDEDMerge 3: 20-65+ Black Women: Inland North
  • ADDEDMerge 3: 20-65+ Black Women: Inland South
  • ADDEDMerge 3: 20-65+ Black Women: North Coast
  • ADDEDMerge 2: 34-65+ Black Women: South Coast
  • ADDED: Merge 3: 20-65+ Asian Men: Inland South
  • ADDED:Merge 3: 20-65+ Asian Women: Inland North
  • ADDED:Merge 3: 20-65+ Asian Men: Inland North
  • ADDED:Merge 2: 34-65+ Hispanic Women: Inland North
  • ADDED:Merge 2: 34-65+ Hispanic Women: North Coast
  • ADDEDMerge 2: 34-65+ Hispanic Men: Inland North
  • ADDEDMerge 2: 34-65+ Hispanic Men: North Coast

Will manually change strata because it is faster, for code that begin to merge, see below

July19_strata_condensed <- read_csv("July19_strata_condensed.csv")
## Parsed with column specification:
## cols(
##   race_sex = col_character(),
##   age_range = col_character(),
##   quadrant = col_character(),
##   ID = col_character(),
##   population = col_double()
## )
final_cond_strata <- July19_strata_condensed%>% 
  mutate(ID = id_variable(race_sex, age_range, quadrant, .method = "character"))

#View(final_cond_strata)

#write.csv(final_cond_strata, file.path(path, "final_cond_strata.csv"), row.names=FALSE)

Exploring New Quandrant & Condensed Strata

  • Those condensed had a mean=204,577. and an sd=65,536.05
  • Those grouped 35-85+ had a mean of 274540.5, still a lot smaller than mean
  • Those grouped 20-65+ had a mean of 183890.9
  • Removing the condensed column, the mean=626,643.9
  • I feel comfortable in the size of our grouping
#Starta in general
mean(final_cond_strata$population) # 571,699.6 - significantly larger mean than for pre-condensed
## [1] 571699.6
min(final_cond_strata$population) # 109,622
## [1] 109622
max(final_cond_strata$population) #661,522
## [1] 2661522
sd(final_cond_strata$population) #511,083.3
## [1] 511083.3
##Exploratory stats for condensed
cond_vs_not <- final_cond_strata %>% 
  filter(age_range == c("20_85plus", "35-85plus")) 

mean(cond_vs_not$population) #20-85plus: 183890.9 w/ 35-85 plus: 204577.5, 
## [1] 204577.5
sd(cond_vs_not$population) #65536.05
## [1] 65536.05
cond_vs_not1 <- final_cond_strata %>% 
  filter(age_range == c("20_85plus")) 

mean(cond_vs_not1$population)
## [1] 183890.9
sd(cond_vs_not1$population) #49176.26
## [1] 49176.26
cond_vs_not2 <- final_cond_strata %>% 
  filter(age_range == "35-85plus") 

mean(cond_vs_not2$population) #274540.5 - still a lot smaller than mean
## [1] 274540.5
sd(cond_vs_not2$population) #81473.55
## [1] 81473.55
###NON-condensed
cond_vs_not3 <- final_cond_strata %>% 
  filter(age_range == c("20_34", "35-64", "68-85plus")) 
## Warning in age_range == c("20_34", "35-64", "68-85plus"): longer object length
## is not a multiple of shorter object length
mean(cond_vs_not3$population) #626,643.9
## [1] 626643.9
sd(cond_vs_not3$population) #386,014.9
## [1] 386014.9

Orginal Strata with Quadrants from State

final_strata <- read_csv("final_strata.csv")
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   age_range = col_character(),
##   race_sex = col_character(),
##   quadrant = col_character(),
##   population = col_double()
## )
#View(final_strata)
#This has unique ID, age range, race_sex, quadrant, population size

dim(final_strata) #96 rows, 5 columns
## [1] 96  5

Creating a new column of percent of population size by 96 strata

sum(final_strata$population) #40,018,969
## [1] 40018969
strata_relative_size <- final_strata %>% 
  mutate(percent_pop = population/sum(population)*100)

head(strata_relative_size)
## # A tibble: 6 x 6
##   ID       age_range race_sex     quadrant          population percent_pop
##   <chr>    <chr>     <chr>        <chr>                  <dbl>       <dbl>
## 1 20_AF_IN 20_34     asian_female inland_north           94146       0.235
## 2 20_AF_LB 20_34     asian_female la_belt               186337       0.466
## 3 20_AF_NC 20_34     asian_female north_coast           249132       0.623
## 4 20_AF_SC 20_34     asian_female southern_counties     155792       0.389
## 5 20_AM_IN 20_34     asian_male   inland_north           92458       0.231
## 6 20_AM_LB 20_34     asian_male   la_belt               183186       0.458
range(strata_relative_size$population)
## [1]   21408 1875201
#View(strata_relative_size)

#path <- "/Users/phoeberacine/seafoodmarkets/strata_creation_code"
#write.csv(strata_relative_size, file.path(path, "total_strata_size.csv"), row.names=FALSE)

Should we condense any strata?

6 strata: less than 0.1% or 40,000

  • these strata are Black above the age of 65.
  • Black men and women in inland north
  • Black women in southern counties
  • Black men in the north coast

12 strata: less than 0.1% or 60,000, this adds:

  • Black men and women aged 25-34 in the Inland North and North Coast
  • Asian men in Inland north
  • Black Men above age 65 in the LA Belt

15 strata: less than 0.2% or 80,000, this adds:

  • Asian women 65+ in the Inland North
  • Hispanic men 65+ North Coast
  • Black women 20-34 inland counties

27 strata: less than 0.25% or 100,000, this adds: * Asian women: 20-24, 65+ Inland North * Hispanic male: 65+ North coast, Inland North * Black women: 20-34 Southern Counties, 35-64 Inland North, 65+ LA Belt * Black men: 35-64 North coast, 35-64 Inland North, 20-34 Southern Counties * Asian men: 65+ southern counties, 20-24 Inland North * Hispanic women: 65+ Inland North, North Coast

##Removing rows with less than 0.1% or 40,000 should we need it
morethan_40k_OG <- strata_relative_size %>% 
  filter(percent_pop > "0.1") 

#View(lessthan_40k)
dim(morethan_40k_OG) #90 strata, removed 6 strata
## [1] 90  6
##All strata less than 40k
lessthan_40k_OG <- strata_relative_size %>% 
  filter(percent_pop < "0.1") 

head(lessthan_40k_OG) #all 6 small strata. These strata are all Black above the age of 65. There is an even spread of Black men and women in north coast and inland north. Then we would remove black women in southern counties, and black men in the north coast. 
## # A tibble: 6 x 6
##   ID       age_range race_sex     quadrant          population percent_pop
##   <chr>    <chr>     <chr>        <chr>                  <dbl>       <dbl>
## 1 65_BF_IN 65-85plus black_female inland_north           27996      0.0700
## 2 65_BF_NC 65-85plus black_female north_coast            37289      0.0932
## 3 65_BF_SC 65-85plus black_female southern_counties      38393      0.0959
## 4 65_BM_IN 65-85plus black_male   inland_north           21408      0.0535
## 5 65_BM_NC 65-85plus black_male   north_coast            28262      0.0706
## 6 65_BM_SC 65-85plus black_male   southern_counties      30711      0.0767
##All strata less than 60k, or .15%
lessthan_60k_OG <- strata_relative_size %>% 
  filter(percent_pop < "0.15") 

#12 strata, the new 6 are: Black men and women aged 25-34 in the Inland North and North Coast, Asian men in Inland north, and Black Men above age 65 in the LA Belt

#View(lessthan_60k) 

##All strata less than 80k, or .2%
lessthan_80k_OG <- strata_relative_size %>% 
  filter(percent_pop < "0.2") #strata with less than 0.2% of the population

#View(lessthan_80k)

#All strata less than 2.5%
lessthan_100k_OG <- strata_relative_size %>% 
  filter(percent_pop < "0.25")
#View(lessthan_100k)
dim(lessthan_100k_OG) #27 strata
## [1] 27  6

Methods for Condensing Strata

  • METHODS: do not collapse age or ethnicity, collapse people with the same age, sex, ethnicity by grouping geographies, when this is not possible within smaller strata - collapse next by sex
  • Ages is difficult in that there are 3 and there can be a gap
  • it’s also nice to mostly group neighboring geographic
  • the question is: what is the largest driver? my guess is geography and age
  • the reason why geography here might not be incredible helpful & is thus mergeable is because only one quadrant is on the coast, would have been nice to separate coastal counties from non-coastal

Strata Being Condensed

#Need to merge 27

Black Men * merge 4 age: 65+, race_sex: Black men, geographies:all * merge 3: 20-34, race_sex: Black men, geographies: all but LA Belt * merge 2: 35-64, race_sex: Black men, geographies: Inland North & North Coast

Black Women * merge 4: age: 65+, race_sex: Black women, geographies: all * merge 2: age 35-64, race_sex: Black women, geographies: inland north, north coast * merge 3: age 20-34, race_sex: Black women, geographies: inland north, north coast, southern counties

Hispanic Women * merge 2: 65+ Hispanic women: north coast & inland north

Hispanic Men * merge 2: 65+ Hispanic men: north coast & inland north

Asian folks above 65+ * merge 2: 65+ Asian men & women, Inland North

Asian folks 20-34 * merge 2: 20-34 Asian men & women, Inland North

Asian men * merge 2: 34-64, 65+ Asian men: Southern Counties

#Need to merge 27

Black Men * merge 4 age: 65+, race_sex: Black men, geographies:all * merge 3: 20-34, race_sex: Black men, geographies: all but LA Belt * merge 2: 35-64, race_sex: Black men, geographies: Inland North & North Coast

Black Women * merge 4: age: 65+, race_sex: Black women, geographies: all * merge 2: age 35-64, race_sex: Black women, geographies: inland north, north coast * merge 3: age 20-34, race_sex: Black women, geographies: inland north, north coast, southern counties

Hispanic Women * merge 2: 65+ Hispanic women: north coast & inland north

Hispanic Men * merge 2: 65+ Hispanic men: north coast & inland north

Asian folks above 65+ - change this, merge geographies if I think that’s what matters * merge 2: 65+ Asian men & women, Inland North, Southern Counties * merge 2: 65+ Asian men: men & women, Southern Counties

Asian folks 34-64 * merge 2: 34-64, men and women, Southern Counties

Asian folks 20-34 * merge 2: 20-34 Asian men & women, Inland North

Guidance: - https://markhneedham.com/blog/2015/06/27/r-dplyr-squashing-multiple-rows-per-group-into-one/ - https://stackoverflow.com/questions/39092110/r-dplyr-summarize-and-retain-other-columns

####Code: * first we’re condensing rows in the unique ways they need to be condensed using summarise * next, we’re bringing new rows together

#Need to merge 27, 29 total

####Black Men
#merge 4 age: 65+, race_sex: Black men, geographies: all

#this will filter out the strata of interest, then using summarise we can create new column values
BM_65_ALL <- strata_relative_size %>% 
  filter(age_range == "65-85plus", race_sex == "black_male") %>% #we need to filter since we need population to sum between particular rows
  group_by(age_range, race_sex) %>% #group_by pretty much always goes before summarise
  summarise(ID = first("65_BM_ALL"), age_range = first("65-85plus"), race_sex = "black_male", 
            quadrant = paste(quadrant, collapse = ", "), population = sum(population)) #first() allows you to create a new row value, you can also put a column name in there if you want to same values

#View(BM_65_ALL)
                                                                                                              
##merge 3: 20-34, race_sex: Black men, geographies: all but LA Belt
BM_20_IN_NC_SC <- strata_relative_size %>% 
  filter(age_range == "20_34", race_sex == "black_male", 
         quadrant %in% c("inland_north", "north_coast", "southern_counties")) %>% 
  group_by(age_range, race_sex) %>% 
  summarise(ID = first("20_BM_IN_NC_SC"), age_range = first("20_34"), race_sex = "black_male", 
            quadrant = paste(quadrant, collapse = ", "), population = sum(population))

#View(BM_20_IN_NC_SC)

#merge 2: 35-64, race_sex: Black men, geographies: Inland North & North Coast
BM_35_IN_NC <- strata_relative_size %>% 
  filter(age_range == "35-64", race_sex == "black_male",quadrant %in% c("inland_north", "north_coast")) %>% 
  group_by(age_range, race_sex) %>% 
  summarise(ID = first("20_BM_IN_NC"), age_range = first("35_64"), race_sex = "black_male", 
            quadrant = paste(quadrant, collapse = ", "), population = sum(population))
#View(BM_35_IN_NC)

#Black Women
##merge 4: age: 65+, race_sex: Black women, geographies: all 

##merge 2: age 35-64, race_sex: Black women, geographies: inland north, north coast

##merge 3: age 20-34, race_sex: Black women, geographies: inland north, north coast, southern counties

#Hispanic Women
##merge 2: 65+ Hispanic women: north coast & inland north

#Hispanic Men
##merge 2: 65+ Hispanic men: north coast & inland north


#Asian folks above 65+
## merge 2: 65+ Asian men & women, Inland North

## merge 2: 65+ Asian men: men & women, Southern Counties

#Asian folks 34-64
## merge 2: 34-64, men and women, Southern Counties

#Asian folks 20-34
# merge 2: 20-34 Asian men & women, Inland North

Condensed Strata

condensed_strata <- read_csv("condensed_strata.csv") %>% 
   mutate(percent_pop = population/sum(population)*100)
## Parsed with column specification:
## cols(
##   ID = col_character(),
##   age_range = col_character(),
##   race_sex = col_character(),
##   quadrant = col_character(),
##   population = col_double()
## )
head(condensed_strata)
## # A tibble: 6 x 6
##   ID          age_range race_sex         quadrant         population percent_pop
##   <chr>       <chr>     <chr>            <chr>                 <dbl>       <dbl>
## 1 20_AM_AF_IN 20_34     asian_male_fema… inland_north         186604       0.465
## 2 20_AF_LB    20_34     asian_female     la_belt              186337       0.465
## 3 20_AF_NC    20_34     asian_female     north_coast          249132       0.621
## 4 20_AF_SC    20_34     asian_female     southern_counti…     155792       0.389
## 5 20_AM_LB    20_34     asian_male       la_belt              183186       0.457
## 6 20_AM_NC    20_34     asian_male       north_coast          254296       0.634

96 Strata by Size

  • some bins are real small
ggplot(strata_relative_size, aes(population)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

96 Strata size relative to population

ggplot(strata_relative_size, aes(x = percent_pop, fill = race_sex)) +
  geom_histogram(alpha=0.5, position="identity") 
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Viewing Race & Sex by Population in each strata

  • this is broken down by age
p1 <- ggplot(strata_relative_size, aes(x = population, color =race_sex)) +
  geom_freqpoly() +
  scale_x_log10(breaks = c(50, 150, 400, 750) * 1000)
  
p2 <- ggplot(strata_relative_size, aes(x = population, color =race_sex, fill = race_sex)) +
  geom_density(alpha = .15) +
  scale_x_log10(breaks = c(50, 150, 400, 750) * 1000)

gridExtra::grid.arrange(p1, p2, nrow = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

96 Strata by Geographic Quandrant

  • some populations are pretty small
##Boxplot by California geography
ggplot(data = strata_relative_size, mapping = aes(x = quadrant, y = population)) +
  geom_boxplot(alpha = 0) +
    geom_jitter(alpha = 0.3, aes(color = race_sex))

Comparing Age Ranges and Race by Sex

#Boxplot by California age ranges
ggplot(data = strata_relative_size, mapping = aes(x = age_range, y = population)) +
  geom_boxplot(alpha = 0) +
    geom_jitter(alpha = 0.3, aes(color = race_sex))

Comparing LA Belt, 96 strata

NOTE: Why are "Whiteother Male and White Other Female showing up multiple times?? * oh!!! age group!

la_belt_plot1 <- strata_relative_size %>% 
  select(-age_range, -ID) %>% 
  filter(quadrant == "la_belt") %>% 
ggplot(mapping = aes(x = quadrant, y = population)) +
  geom_boxplot(alpha = 0) +
    geom_jitter(alpha = 0.3, aes(color = race_sex)) +
  #geom_text(aes(label=race_sex),hjust=0, vjust=0) +
   geom_label_repel(aes(label = race_sex),
                  box.padding   = 0.35, 
                  point.padding = 0.5,
                  segment.color = 'grey50') +
  theme_classic()

la_belt_plot1

North Coast Strata

  • I know this is ugly but can’t think of what else would be useful at the moment
north_coast_plot1 <- strata_relative_size %>% 
  filter(quadrant == "north_coast") %>% 
ggplot(mapping = aes(x = quadrant, y = population)) +
  geom_boxplot(alpha = 0) +
    geom_jitter(alpha = 0.3, aes(color = race_sex)) +
  #geom_text(aes(label=race_sex),hjust=0, vjust=0) +
   geom_label_repel(aes(label = race_sex),
                  box.padding   = 0.35, 
                  point.padding = 0.5,
                  segment.color = 'grey50') +
  theme_classic()

north_coast_plot1

Comparing Strata of Southern Counties

southern_counties_plot1 <- strata_relative_size %>% 
  filter(quadrant == "southern_counties") %>% 
ggplot(mapping = aes(x = quadrant, y = population)) +
  geom_boxplot(alpha = 0) +
    geom_jitter(alpha = 0.3, aes(color = race_sex)) +
  #geom_text(aes(label=race_sex),hjust=0, vjust=0) +
   geom_label_repel(aes(label = race_sex),
                  box.padding   = 0.35, 
                  point.padding = 0.5,
                  segment.color = 'grey50') +
  theme_classic()

southern_counties_plot1

Comparing Strata of Inland North

inland_north_plot1 <- strata_relative_size %>% 
  filter(quadrant == "inland_north") %>% 
ggplot(mapping = aes(x = quadrant, y = population)) +
  geom_boxplot(alpha = 0) +
    geom_jitter(alpha = 0.3, aes(color = race_sex)) +
  #geom_text(aes(label=race_sex),hjust=0, vjust=0) +
   geom_label_repel(aes(label = race_sex),
                  box.padding   = 0.35, 
                  point.padding = 0.5,
                  segment.color = 'grey50') +
  theme_classic()

inland_north_plot1