Descriptive States

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv")
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
table1 <- data %>%
  mutate(outside_top_100=ifelse(is.na(avg_eal_tercile),"outside_top_100","within_top_100")) %>%
  group_by(archive_version_year,outside_top_100) %>%
  summarise(rows=sum(rows,na.rm = TRUE)) %>%
  pivot_wider(id_cols = archive_version_year,names_from = outside_top_100,values_from = rows) %>%
  mutate(outside_prop=outside_top_100/(outside_top_100+within_top_100),
         inside_prop=within_top_100/(outside_top_100+within_top_100))
## `summarise()` has grouped output by 'archive_version_year'. You can override
## using the `.groups` argument.
kable(table1,caption="Share of establishments in/out top 100 MSAs")
Share of establishments in/out top 100 MSAs
archive_version_year outside_top_100 within_top_100 outside_prop inside_prop
2013 5516520 10437838 0.3457688 0.6542312
2014 5573021 10343882 0.3501322 0.6498678
2015 5463901 10165015 0.3496020 0.6503980
2016 5423220 10293413 0.3450625 0.6549375
2017 5124766 9702386 0.3456339 0.6543661
2018 5050960 9648466 0.3436161 0.6563839
2019 5151613 10345141 0.3324317 0.6675683
2020 5139349 10385438 0.3310415 0.6689585
2021 5247277 10705948 0.3289164 0.6710836
2022 5472590 11401758 0.3243142 0.6756858
moves <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/migration_map.csv") %>%
  mutate(old_outside_top_100=ifelse(is.na(avg_eal_tercile_old),"outside_top_100","within_top_100")) %>%
  mutate(new_outside_top_100=ifelse(is.na(avg_eal_tercile_new),"outside_top_100","within_top_100")) %>%
  mutate(type=paste0(old_outside_top_100,"_to_",new_outside_top_100)) %>%
  group_by(archive_version_year,type) %>%
  summarise(moves=sum(row_count,na.rm = TRUE)) %>%
  pivot_wider(id_cols = archive_version_year,names_from = type,values_from = moves) %>%
  rowwise() %>%
  mutate(total = sum(c_across(-1), na.rm = TRUE))
## Rows: 63970 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): fips_code, lead_county, avg_eal_tercile_old, NAME_old, avg_eal_ter...
## dbl (12): archive_version_year, row_count, CBSAFP_old, CBSAFP_new, avg_eal_o...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'archive_version_year'. You can override using the `.groups` argument.
kable(moves,caption="Moves in.out top 100 MSAs")
Moves in.out top 100 MSAs
archive_version_year outside_top_100_to_outside_top_100 outside_top_100_to_within_top_100 within_top_100_to_outside_top_100 within_top_100_to_within_top_100 total
2012 3317 2218 2054 3635 7907
2013 3422 2435 2252 3963 8650
2014 2876 1899 1868 3078 6845
2015 2415 1740 1549 2642 5931
2016 3372 2437 2190 3948 8575
2017 2904 2411 2248 3944 8603
2018 4045 3731 3342 6559 13632
2019 3412 2721 2812 4895 10428
2020 2525 2143 2195 4230 8568
2021 2953 2672 2881 5778 11331
moves_prop <- moves %>%
  mutate(prop_outside_top_100_to_outside_top_100=outside_top_100_to_outside_top_100/total,
         prop_outside_top_100_to_within_top_100=outside_top_100_to_within_top_100/total,
         prop_within_top_100_to_outside_top_100=within_top_100_to_outside_top_100/total,
         prop_within_top_100_to_within_top_100=within_top_100_to_within_top_100/total) %>%
    select(-c(2:5))

kable(moves,caption="Shares of moves in.out top 100 MSAs")
Shares of moves in.out top 100 MSAs
archive_version_year outside_top_100_to_outside_top_100 outside_top_100_to_within_top_100 within_top_100_to_outside_top_100 within_top_100_to_within_top_100 total
2012 3317 2218 2054 3635 7907
2013 3422 2435 2252 3963 8650
2014 2876 1899 1868 3078 6845
2015 2415 1740 1549 2642 5931
2016 3372 2437 2190 3948 8575
2017 2904 2411 2248 3944 8603
2018 4045 3731 3342 6559 13632
2019 3412 2721 2812 4895 10428
2020 2525 2143 2195 4230 8568
2021 2953 2672 2881 5778 11331

Census

3 is highest risk, 2 is moderate, 1 is low

table <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/census.csv")
## Rows: 4 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): avg_eal_tercile
## dbl (18): asian, avghhsize, bacplus, black, child, density, hispanic, medgro...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
table2 <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/industry.csv")
## Rows: 4 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): avg_eal_tercile
## dbl (14): FIRE, agriculutre_forestry_etc, armed_forces, arts_ent_rec_etc, co...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kable(table,caption="Census Variables by Risk Level")
Census Variables by Risk Level
avg_eal_tercile asian avghhsize bacplus black child density hispanic medgrossrents medhhinc medianhhval other owner_occupied poverty senior total unemp vacant white
1 0.0315712 2.492941 0.3567584 0.1567143 0.2055315 345.1562 0.1144124 1042.8824 68181.35 219141.2 0.0381176 0.6606767 0.1227416 0.1556827 1494376.3 0.0503188 0.0879593 0.6591846
2 0.0374453 2.589091 0.3574478 0.1343262 0.2111732 604.2491 0.1572416 1089.0303 71106.09 240212.1 0.0427232 0.6623240 0.1234514 0.1502595 2060857.9 0.0543446 0.0858594 0.6282638
3 0.0889935 2.749091 0.3387351 0.0946081 0.2056592 828.4572 0.2888781 1412.9091 75917.24 387818.2 0.0494946 0.6279769 0.1258272 0.1651189 3149811.3 0.0588635 0.1053852 0.4780258
Outside 0.0150771 2.527414 0.2405747 0.0833987 0.2033337 115.2304 0.1335524 854.9571 56550.76 176321.7 0.0440430 0.6892702 0.1554496 0.1803648 109954.8 0.0553571 0.1511438 0.7239288
kable(table2,caption="Census Industry by Risk Level")
Census Industry by Risk Level
avg_eal_tercile FIRE agriculutre_forestry_etc armed_forces arts_ent_rec_etc construction education_healthcare_etc information manufacturing other professional_science_managment_etc public_admin retail transportation_warehouse_utilities wholesale
1 0.0722513 0.0089891 0.0099165 0.0835972 0.0632696 0.2451436 0.0180693 0.0957320 0.0461523 0.1176257 0.0523191 0.1067635 0.0549619 0.0252088
2 0.0703715 0.0083459 0.0074881 0.0846070 0.0643641 0.2331948 0.0176900 0.1133960 0.0466104 0.1192654 0.0454758 0.1095589 0.0535857 0.0260463
3 0.0670700 0.0173628 0.0086841 0.1020863 0.0728896 0.2182436 0.0198586 0.0764031 0.0474106 0.1310278 0.0458772 0.1119322 0.0562385 0.0249155
Outside 0.0444278 0.0391243 0.0089660 0.0892926 0.0697175 0.2388585 0.0121106 0.1328069 0.0469522 0.0739920 0.0527425 0.1165147 0.0529105 0.0215839

T-tests 3 is highest risk, 2 is moderate, 1 is low

library(rstatix)
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
table <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/census_cbsas.csv")
## Rows: 939 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): avg_eal_tercile
## dbl (18): asian, avghhsize, black, hispanic, medgrossrents, medhhinc, median...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names <- names(table)[2:19]
table_tidy <- table %>% pivot_longer(!avg_eal_tercile)
for (i in names) {
  print(i)
  x <- table_tidy %>%
    filter(name==i) %>%
    mutate(avg_eal_tercile=as.character(avg_eal_tercile)) %>%
    select(-name)
  pairwise_result <- x %>% pairwise_t_test(value~avg_eal_tercile, p.adjust.method = "bonferroni") %>%
    mutate(variable=i) %>%
    select(variable, everything()) %>%
    select(-.y.)
  print(pairwise_result)
}
## [1] "asian"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 asian    1      2          34    33 3.9 e- 1 ns       1   e+ 0 ns          
## 2 asian    1      3          34    33 1.55e-16 ****     9.31e-16 ****        
## 3 asian    2      3          33    33 1.57e-13 ****     9.45e-13 ****        
## 4 asian    1      Outside    34   839 7.73e- 4 ***      4.64e- 3 **          
## 5 asian    2      Outside    33   839 7.32e- 6 ****     4.39e- 5 ****        
## 6 asian    3      Outside    33   839 3.22e-45 ****     1.93e-44 ****        
## [1] "avghhsize"
## # A tibble: 6 × 9
##   variable  group1 group2     n1    n2           p p.signif   p.adj p.adj.signif
##   <chr>     <chr>  <chr>   <int> <int>       <dbl> <chr>      <dbl> <chr>       
## 1 avghhsize 1      2          34    33     7.5 e-2 ns       4.5 e-1 ns          
## 2 avghhsize 1      3          34    33     2.37e-6 ****     1.42e-5 ****        
## 3 avghhsize 2      3          33    33     3.32e-3 **       1.99e-2 *           
## 4 avghhsize 1      Outside    34   839     3.72e-1 ns       1   e+0 ns          
## 5 avghhsize 2      Outside    33   839     1.16e-1 ns       6.94e-1 ns          
## 6 avghhsize 3      Outside    33   839     2.03e-8 ****     1.22e-7 ****        
## [1] "black"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2        p p.signif   p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>    <dbl> <chr>      <dbl> <chr>       
## 1 black    1      2          34    33 0.462    ns       1       ns          
## 2 black    1      3          34    33 0.0416   *        0.25    ns          
## 3 black    2      3          33    33 0.196    ns       1       ns          
## 4 black    1      Outside    34   839 0.000798 ***      0.00479 **          
## 5 black    2      Outside    33   839 0.0214   *        0.129   ns          
## 6 black    3      Outside    33   839 0.612    ns       1       ns          
## [1] "hispanic"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2         p p.signif     p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>     <dbl> <chr>        <dbl> <chr>       
## 1 hispanic 1      2          34    33 0.354     ns       1         ns          
## 2 hispanic 1      3          34    33 0.00017   ***      0.00102   **          
## 3 hispanic 2      3          33    33 0.00479   **       0.0287    *           
## 4 hispanic 1      Outside    34   839 0.563     ns       1         ns          
## 5 hispanic 2      Outside    33   839 0.48      ns       1         ns          
## 6 hispanic 3      Outside    33   839 0.0000042 ****     0.0000252 ****        
## [1] "medgrossrents"
## # A tibble: 6 × 9
##   variable     group1 group2    n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>        <chr>  <chr>  <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 medgrossren… 1      2         34    33 3.75e- 1 ns       1   e+ 0 ns          
## 2 medgrossren… 1      3         34    33 2.24e-12 ****     1.34e-11 ****        
## 3 medgrossren… 2      3         33    33 9.51e-10 ****     5.71e- 9 ****        
## 4 medgrossren… 1      Outsi…    34   839 5.40e- 7 ****     3.24e- 6 ****        
## 5 medgrossren… 2      Outsi…    33   839 8.64e-10 ****     5.18e- 9 ****        
## 6 medgrossren… 3      Outsi…    33   839 1.57e-44 ****     9.40e-44 ****        
## [1] "medhhinc"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 medhhinc 1      2          34    33 3.35e- 1 ns       1   e+ 0 ns          
## 2 medhhinc 1      3          34    33 1.09e- 2 *        6.54e- 2 ns          
## 3 medhhinc 2      3          33    33 1.16e- 1 ns       6.94e- 1 ns          
## 4 medhhinc 1      Outside    34   839 1.06e- 7 ****     6.39e- 7 ****        
## 5 medhhinc 2      Outside    33   839 6.5 e-11 ****     3.90e-10 ****        
## 6 medhhinc 3      Outside    33   839 6.88e-18 ****     4.13e-17 ****        
## [1] "medianhhval"
## # A tibble: 6 × 9
##   variable    group1 group2     n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>       <chr>  <chr>   <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 medianhhval 1      2          34    33 4.07e- 1 ns       1   e+ 0 ns          
## 2 medianhhval 1      3          34    33 5.51e-11 ****     3.31e-10 ****        
## 3 medianhhval 2      3          33    33 1.12e- 8 ****     6.74e- 8 ****        
## 4 medianhhval 1      Outside    34   839 1.89e- 2 *        1.13e- 1 ns          
## 5 medianhhval 2      Outside    33   839 5.64e- 4 ***      3.38e- 3 **          
## 6 medianhhval 3      Outside    33   839 1.61e-28 ****     9.68e-28 ****        
## [1] "owner_occupied"
## # A tibble: 6 × 9
##   variable       group1 group2    n1    n2       p p.signif   p.adj p.adj.signif
##   <chr>          <chr>  <chr>  <int> <int>   <dbl> <chr>      <dbl> <chr>       
## 1 owner_occupied 1      2         34    33 9.14e-1 ns       1   e+0 ns          
## 2 owner_occupied 1      3         34    33 3.16e-2 *        1.89e-1 ns          
## 3 owner_occupied 2      3         33    33 2.5 e-2 *        1.5 e-1 ns          
## 4 owner_occupied 1      Outsi…    34   839 8.68e-3 **       5.21e-2 ns          
## 5 owner_occupied 2      Outsi…    33   839 1.48e-2 *        8.85e-2 ns          
## 6 owner_occupied 3      Outsi…    33   839 3.58e-8 ****     2.15e-7 ****        
## [1] "poverty"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2       p p.signif  p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>   <dbl> <chr>     <dbl> <chr>       
## 1 poverty  1      2          34    33 0.963   ns       1      ns          
## 2 poverty  1      3          34    33 0.839   ns       1      ns          
## 3 poverty  2      3          33    33 0.877   ns       1      ns          
## 4 poverty  1      Outside    34   839 0.00275 **       0.0165 *           
## 5 poverty  2      Outside    33   839 0.00388 **       0.0233 *           
## 6 poverty  3      Outside    33   839 0.00749 **       0.0449 *           
## [1] "total"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 total    1      2          34    33 8.36e- 3 **       5.01e- 2 ns          
## 2 total    1      3          34    33 2.91e-14 ****     1.74e-13 ****        
## 3 total    2      3          33    33 5.50e- 7 ****     3.30e- 6 ****        
## 4 total    1      Outside    34   839 1.02e-18 ****     6.14e-18 ****        
## 5 total    2      Outside    33   839 2.02e-33 ****     1.21e-32 ****        
## 6 total    3      Outside    33   839 1.69e-71 ****     1.01e-70 ****        
## [1] "unemp"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2     p p.signif p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int> <dbl> <chr>    <dbl> <chr>       
## 1 unemp    1      2          34    33 0.444 ns       1     ns          
## 2 unemp    1      3          34    33 0.104 ns       0.626 ns          
## 3 unemp    2      3          33    33 0.394 ns       1     ns          
## 4 unemp    1      Outside    34   839 0.181 ns       1     ns          
## 5 unemp    2      Outside    33   839 0.791 ns       1     ns          
## 6 unemp    3      Outside    33   839 0.358 ns       1     ns          
## [1] "vacant"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2           p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>       <dbl> <chr>       <dbl> <chr>       
## 1 vacant   1      2          34    33 0.907       ns        1   e+0 ns          
## 2 vacant   1      3          34    33 0.333       ns        1   e+0 ns          
## 3 vacant   2      3          33    33 0.282       ns        1   e+0 ns          
## 4 vacant   1      Outside    34   839 0.00000109  ****      6.56e-6 ****        
## 5 vacant   2      Outside    33   839 0.000000693 ****      4.16e-6 ****        
## 6 vacant   3      Outside    33   839 0.000482    ***       2.89e-3 **          
## [1] "white"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 white    1      2          34    33 5.38e- 1 ns       1   e+ 0 ns          
## 2 white    1      3          34    33 3.22e- 4 ***      1.93e- 3 **          
## 3 white    2      3          33    33 3.04e- 3 **       1.82e- 2 *           
## 4 white    1      Outside    34   839 7.18e- 2 ns       4.31e- 1 ns          
## 5 white    2      Outside    33   839 8.8 e- 3 **       5.28e- 2 ns          
## 6 white    3      Outside    33   839 2.63e-11 ****     1.58e-10 ****        
## [1] "other"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2     p p.signif p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int> <dbl> <chr>    <dbl> <chr>       
## 1 other    1      2          34    33 0.721 ns           1 ns          
## 2 other    1      3          34    33 0.377 ns           1 ns          
## 3 other    2      3          33    33 0.602 ns           1 ns          
## 4 other    1      Outside    34   839 0.52  ns           1 ns          
## 5 other    2      Outside    33   839 0.888 ns           1 ns          
## 6 other    3      Outside    33   839 0.56  ns           1 ns          
## [1] "density"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 density  1      2          34    33 9.40e- 8 ****     5.64e- 7 ****        
## 2 density  1      3          34    33 1.4 e-22 ****     8.39e-22 ****        
## 3 density  2      3          33    33 4.34e- 6 ****     2.61e- 5 ****        
## 4 density  1      Outside    34   839 4.39e-11 ****     2.63e-10 ****        
## 5 density  2      Outside    33   839 1.74e-40 ****     1.05e-39 ****        
## 6 density  3      Outside    33   839 9.06e-77 ****     5.43e-76 ****        
## [1] "child"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2     p p.signif p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int> <dbl> <chr>    <dbl> <chr>       
## 1 child    1      2          34    33 0.433 ns       1     ns          
## 2 child    1      3          34    33 0.986 ns       1     ns          
## 3 child    2      3          33    33 0.447 ns       1     ns          
## 4 child    1      Outside    34   839 0.67  ns       1     ns          
## 5 child    2      Outside    33   839 0.134 ns       0.804 ns          
## 6 child    3      Outside    33   839 0.657 ns       1     ns          
## [1] "senior"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2         p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>     <dbl> <chr>       <dbl> <chr>       
## 1 senior   1      2          34    33 0.584     ns       1        ns          
## 2 senior   1      3          34    33 0.34      ns       1        ns          
## 3 senior   2      3          33    33 0.136     ns       0.817    ns          
## 4 senior   1      Outside    34   839 0.000513  ***      0.00308  **          
## 5 senior   2      Outside    33   839 0.0000303 ****     0.000182 ***         
## 6 senior   3      Outside    33   839 0.034     *        0.204    ns          
## [1] "bacplus"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 bacplus  1      2          34    33 9.74e- 1 ns       1   e+ 0 ns          
## 2 bacplus  1      3          34    33 3.89e- 1 ns       1   e+ 0 ns          
## 3 bacplus  2      3          33    33 3.74e- 1 ns       1   e+ 0 ns          
## 4 bacplus  1      Outside    34   839 2.09e-14 ****     1.25e-13 ****        
## 5 bacplus  2      Outside    33   839 3.39e-14 ****     2.03e-13 ****        
## 6 bacplus  3      Outside    33   839 1.58e-10 ****     9.5 e-10 ****

Line Graphs

Number of Business & Mean Establishment Year by risk level over year

Original

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
  filter(!is.na(avg_eal))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
plot1 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,rows,color=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Number of Businesses")
# ggsave("figures/number_of_businesses.png")
plot2 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,mean_year_established,color=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Mean year established")
# ggsave("figures/mean_year_established.png")

ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot2) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

New

library(ggplot2)
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
  filter(!is.na(avg_eal))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile,
                               levels = c("High", "Moderate", "Low"),
                               labels = c("High Risk", "Moderate Risk", "Low Risk"))

### Plot 1: Number of Businesses

medians_business <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_business = median(rows, na.rm = TRUE), .groups = "drop")

# interactive plot (all individual lines in light gray plus the median line)
plot1_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = rows, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_business, aes(x = archive_version_year, y = median_business),
            color = "blue", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3) +
  labs(x = "Year", y = "Number of Businesses",
       title = "Number of Businesses by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plot1_interactive_plotly <- ggplotly(plot1_interactive)


# static plot with only the median line
plot1_static <- ggplot(medians_business, aes(x = archive_version_year, y = median_business)) +
  geom_line(data = medians_business, aes(x = archive_version_year, y = median_business),
            color = "blue", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3) +
  labs(x = "Year", y = "Median Number of Businesses",
       title = "Median Number of Businesses by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
# ggsave("figures/number_of_businesses_median.png", plot = plot1_static)

plot1_static_bar <- ggplot(medians_business, aes(x = archive_version_year, y = median_business)) +
  geom_bar(stat = "identity", fill = "blue", alpha = 0.7) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = labeller(avg_eal_tercile = c("High" = "High Risk", "Moderate" = "Moderate Risk", "Low" = "Low Risk"))) +
  labs(x = "Year", y = "Median Number of Businesses",
       title = "Median Number of Businesses by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### Plot 2: Mean Year Established

medians_year <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_year = median(mean_year_established, na.rm = TRUE), .groups = "drop")

# interactive plot (all individual lines in light gray plus the median line)
plot2_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = mean_year_established, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_year, aes(x = archive_version_year, y = median_year),
            color = "red", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3) +
  labs(x = "Year", y = "Mean Year Established",
       title = "Mean Year Established by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot2_interactive_plotly <- ggplotly(plot2_interactive)

# static plot with only the median line for Mean Year Established
plot2_static <- ggplot(medians_year, aes(x = archive_version_year, y = median_year)) +
  geom_line(color = "red", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3) +
  labs(x = "Year", y = "Median Mean Year Established",
       title = "Median Mean Year Established by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
# ggsave("figures/mean_year_established_median.png", plot = plot2_static)

plot1_interactive_plotly # interactive plot 1 will all + median
plot1_static # static plot 1 median only

plot1_interactive # static plot 1 with all + median

plot2_interactive_plotly # interactive plot 2 will all + median
plot2_static # static plot 2 with median only

plot2_interactive # static plot 2 with all + median
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).

Business Events by risk level over year

Origianl

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv")
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))

plot1 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,births,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Births")
# ggsave("figures/births.png")

plot2 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,deaths,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Deaths")
# ggsave("figures/deaths.png")

plot3 <- data %>%
  ggplot()+
  geom_line(aes(archive_version_year,entry,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Entries")
# ggsave("figures/entries.png")

plot4 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,exit,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =)+
  ggtitle("Exits")
# ggsave("figures/exits.png")

ggplotly(plot1) 
ggplotly(plot2) 
ggplotly(plot3) 
ggplotly(plot4) 

New

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv")
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))

data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
risk_labeller <- labeller(avg_eal_tercile = c("High" = "High Risk",
                                                "Moderate" = "Moderate Risk",
                                                "Low" = "Low Risk"))

### 1. Births

medians_births <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_births = median(births, na.rm = TRUE), .groups = "drop")

# Interactive plot: all individual lines (light gray) with median line (blue)
plot_births_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = births, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_births, aes(x = archive_version_year, y = median_births),
            color = "blue", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Business Births",
       title = "Business Births by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot_births_interactive_plotly <- ggplotly(plot_births_interactive)

# Static plot: only the median line
plot_births_static <- ggplot(medians_births, aes(x = archive_version_year, y = median_births)) +
  geom_line(color = "blue", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Business Births",
       title = "Median Business Births by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### 2. Deaths

medians_deaths <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_deaths = median(deaths, na.rm = TRUE), .groups = "drop")

plot_deaths_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = deaths, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_deaths, aes(x = archive_version_year, y = median_deaths),
            color = "red", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Business Deaths",
       title = "Business Deaths by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot_deaths_interactive_plotly <- ggplotly(plot_deaths_interactive)

plot_deaths_static <- ggplot(medians_deaths, aes(x = archive_version_year, y = median_deaths)) +
  geom_line(color = "red", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Business Deaths",
       title = "Median Business Deaths by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### 3. Entries

medians_entries <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_entries = median(entry, na.rm = TRUE), .groups = "drop")

plot_entries_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = entry, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_entries, aes(x = archive_version_year, y = median_entries),
            color = "green", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Business Entries",
       title = "Business Entries by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot_entries_interactive_plotly <- ggplotly(plot_entries_interactive)

plot_entries_static <- ggplot(medians_entries, aes(x = archive_version_year, y = median_entries)) +
  geom_line(color = "green", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Business Entries",
       title = "Median Business Entries by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### 4. Exits

medians_exits <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_exits = median(exit, na.rm = TRUE), .groups = "drop")

plot_exits_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = exit, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_exits, aes(x = archive_version_year, y = median_exits),
            color = "purple", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Business Exits",
       title = "Business Exits by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot_exits_interactive_plotly <- ggplotly(plot_exits_interactive)

plot_exits_static <- ggplot(medians_exits, aes(x = archive_version_year, y = median_exits)) +
  geom_line(color = "purple", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Business Exits",
       title = "Median Business Exits by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

# interactive with all + median
plot_births_interactive_plotly
plot_deaths_interactive_plotly
plot_entries_interactive_plotly
plot_exits_interactive_plotly
# static with median
plot_births_static

plot_deaths_static
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

plot_entries_static

plot_exits_static
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

# static with all + median
plot_births_interactive

plot_deaths_interactive
## Warning: Removed 100 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

plot_entries_interactive

plot_exits_interactive
## Warning: Removed 100 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

number of critical and non critical business’s business events by risk level over year 

original

all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all.csv")
all$avg_eal_tercile <- factor(all$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
all$pop_tercile <- factor(all$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

births <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_births.csv") %>%
  filter(archive_version_year != 2012)
births$avg_eal_tercile <- factor(births$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
births$pop_tercile <- factor(births$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

entries <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_entries.csv")%>%
  filter(archive_version_year != 2012)
entries$avg_eal_tercile <- factor(entries$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
entries$pop_tercile <- factor(entries$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

deaths <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_deaths.csv")%>%
  filter(archive_version_year != 2022)
deaths$avg_eal_tercile <- factor(deaths$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
deaths$pop_tercile <- factor(deaths$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

exits <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_exits.csv")%>%
  filter(archive_version_year != 2022)
exits$avg_eal_tercile <- factor(exits$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
exits$pop_tercile <- factor(exits$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))


plot1 <- all %>%
  ggplot()+
  geom_line(aes(archive_version_year,row_count,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by population tercile, all")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/critical_by_pop.png")

plot2 <- births %>%ggplot()+
  geom_line(aes(archive_version_year,births,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, births")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/naics_by_tercile_births.png")


plot3 <- deaths %>% ggplot()+
  geom_line(aes(archive_version_year,deaths,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, deaths")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/naics_by_tercile_deaths.png")

plot4 <- entries %>% ggplot()+
  geom_line(aes(archive_version_year,entries,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, entries")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/naics_by_tercile_entries.png")

plot5 <- exits %>% ggplot()+
  geom_line(aes(archive_version_year,exits,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, exits")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/naics_by_tercile_exits.png")

plot1

plot2

plot3

plot4

plot5

new (just overall for now, i will add each business events)

lineplot version

all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all.csv") %>%
  filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile))
## Rows: 220 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pop_tercile, avg_eal_tercile, critical
## dbl (2): archive_version_year, row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all$avg_eal_tercile <- factor(all$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
all$pop_tercile <- factor(all$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

all <- all %>%
  mutate(critical = recode(critical,
                           "Not" = "Non-Critical",
                           "Critical" = "Critical"))

plot1 <- all %>%
  ggplot(aes(x = archive_version_year, y = row_count, color = critical)) +
  geom_line(size = 0.6, alpha = 0.8) +
  facet_grid(pop_tercile ~ avg_eal_tercile,
             labeller = labeller(avg_eal_tercile = c("High" = "High Risk",
                                                     "Moderate" = "Moderate Risk",
                                                     "Low" = "Low Risk"))) +
  labs(x = "Year",
       y = "Number of Business",
       title = "Business Counts by Population Tercile and Risk Level",
       color = "Critical Status") +
  theme_minimal() +
  theme(legend.position = "top")

plot1

stacked barplot version

all <- all %>%
  mutate(avg_eal_tercile = factor(avg_eal_tercile, levels = c("High", "Moderate", "Low"),
                                  labels = c("High Risk", "Moderate Risk", "Low Risk")),
         pop_tercile = factor(pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low")))

plot1_stk <- all %>%
  ggplot(aes(x = archive_version_year, y = row_count, fill = critical)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_grid(pop_tercile ~ avg_eal_tercile) +
  labs(x = "Year",
       y = "Number of Business",
       title = "Business Counts by Population Tercile and Risk Level",
       fill = "Critical Status") +
  theme_minimal() +
  theme(legend.position = "top")

plot1_stk

Number of Business & Mean Year Established by risk level and population level over year

original 

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
  mutate(CBSAFP=as.character(CBSAFP)) %>%
  filter(!is.na(avg_eal)) %>%
  left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))

plot1 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,rows,color=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Number of Businesses")
# ggsave("figures/number_of_businesses.png")
plot2 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,mean_year_established,color=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Mean year established")
# ggsave("figures/mean_year_established.png")

ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot2) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

new

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>% 
  mutate(CBSAFP = as.character(CBSAFP)) %>% 
  filter(!is.na(avg_eal)) %>% 
  left_join(top100_cbsa, by = c("CBSAFP" = "GEOID")) %>%
  filter(!is.na(pop_tercile))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, 
                               levels = c("High", "Moderate", "Low"),
                               labels = c("High Risk", "Moderate Risk", "Low Risk"))
data$pop_tercile <- factor(data$pop_tercile, 
                           levels = c("Pop: Low", "Pop: Moderate", "Pop: High"),
                           labels = c("Low Population", "Moderate Population", "High Population"))

### Plot 1: Number of Businesses

medians_business <- data %>%
  group_by(archive_version_year, pop_tercile, avg_eal_tercile) %>%
  summarise(median_rows = median(rows, na.rm = TRUE), .groups = "drop")

# Combined Plot: All individual CBSA lines (thin gray) + median line (blue)
plot1_combined <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = rows, group = NAME),
            color = "grey", size = 0.5, alpha = 0.5) +
  geom_line(data = medians_business, aes(x = archive_version_year, y = median_rows),
            color = "blue", size = 1.2) +
  facet_grid(pop_tercile ~ avg_eal_tercile) +
  labs(x = "Year", y = "Number of Businesses", 
       title = "Number of Businesses by Population and Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

# Static Plot: Only the median line for Number of Businesses
plot1_median <- ggplot(medians_business, aes(x = archive_version_year, y = median_rows)) +
  geom_line(color = "blue", size = 1.2) +
  facet_grid(pop_tercile ~ avg_eal_tercile) +
  labs(x = "Year", y = "Median Number of Businesses",
       title = "Median Number of Businesses by Population and Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### Plot 2: Mean Year Established

medians_year <- data %>%
  group_by(archive_version_year, pop_tercile, avg_eal_tercile) %>%
  summarise(median_year = median(mean_year_established, na.rm = TRUE), .groups = "drop")

# Combined Plot: All individual CBSA lines (thin gray) + median line (red)
plot2_combined <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = mean_year_established, group = NAME),
            color = "grey", size = 0.5, alpha = 0.5) +
  geom_line(data = medians_year, aes(x = archive_version_year, y = median_year),
            color = "red", size = 1.2) +
  facet_grid(pop_tercile ~ avg_eal_tercile) +
  labs(x = "Year", y = "Mean Year Established", 
       title = "Mean Year Established by Population and Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

# Static Plot: Only the median line for Mean Year Established
plot2_median <- ggplot(medians_year, aes(x = archive_version_year, y = median_year)) +
  geom_line(color = "red", size = 1.2) +
  facet_grid(pop_tercile ~ avg_eal_tercile) +
  labs(x = "Year", y = "Median Mean Year Established",
       title = "Median Mean Year Established by Population and Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

# combined plots
plot1_combined

plot2_combined
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).

# static median-only plots
plot1_median

plot2_median

# interactive version
# interactive_plot1 <- ggplotly(plot1_combined)
# interactive_plot2 <- ggplotly(plot2_combined)

Number of Business Involved in Each Business Events

Old

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv")%>%
  mutate(CBSAFP=as.character(CBSAFP)) %>%
  filter(!is.na(avg_eal)) %>%
  left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))

plot1 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,births,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Births")
# ggsave("figures/births.png")

plot2 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,deaths,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Deaths")
# ggsave("figures/deaths.png")

plot3 <- data %>%
  ggplot()+
  geom_line(aes(archive_version_year,entry,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Entries")
# ggsave("figures/entries.png")

plot4 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,exit,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Exits")
# ggsave("figures/exits.png")

ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot2) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot3) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot4) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

new

(will work on this)

Line Employees

(Need to understand what this is doing)

I dont know if this is business count because these look different from the business counts by risk level by population level from before

original

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys_employees.csv")
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))

plot1 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,births,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Births")
# ggsave("figures/births_emp.png")

plot2 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,deaths,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Deaths")
# ggsave("figures/deaths_emp.png")

plot3 <- data %>%
  ggplot()+
  geom_line(aes(archive_version_year,entry,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Entries - # of employees")
# ggsave("figures/entries_emp.png")

plot4 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,exit,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =)+
  ggtitle("Exits")
# ggsave("figures/exits_emp.png")

ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot2) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot3) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot4) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

new

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys_employees.csv")
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
risk_labeller <- labeller(avg_eal_tercile = c("High" = "High Risk",
                                                "Moderate" = "Moderate Risk",
                                                "Low" = "Low Risk"))

### 1. Births

medians_births <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_births = median(births, na.rm = TRUE), .groups = "drop")

# Interactive plot: all individual lines (light gray) with median line (blue)
plot_births_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = births, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_births, aes(x = archive_version_year, y = median_births),
            color = "blue", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Employees",
       title = "Employee Size Involved in Business Births by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot_births_interactive_plotly <- ggplotly(plot_births_interactive)

# Static plot: only the median line
plot_births_static <- ggplot(medians_births, aes(x = archive_version_year, y = median_births)) +
  geom_line(color = "blue", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Employee Size",
       title = "Median Employee Size by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### 2. Deaths

medians_deaths <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_deaths = median(deaths, na.rm = TRUE), .groups = "drop")

plot_deaths_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = deaths, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_deaths, aes(x = archive_version_year, y = median_deaths),
            color = "red", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Business Deaths",
       title = "Business Deaths by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
# plot_deaths_interactive_plotly <- ggplotly(plot_deaths_interactive)

plot_deaths_static <- ggplot(medians_deaths, aes(x = archive_version_year, y = median_deaths)) +
  geom_line(color = "red", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Business Deaths",
       title = "Median Business Deaths by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### 3. Entries

medians_entries <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_entries = median(entry, na.rm = TRUE), .groups = "drop")

plot_entries_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = entry, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_entries, aes(x = archive_version_year, y = median_entries),
            color = "green", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Business Entries",
       title = "Business Entries by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
# plot_entries_interactive_plotly <- ggplotly(plot_entries_interactive)

plot_entries_static <- ggplot(medians_entries, aes(x = archive_version_year, y = median_entries)) +
  geom_line(color = "green", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Business Entries",
       title = "Median Business Entries by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### 4. Exits

medians_exits <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_exits = median(exit, na.rm = TRUE), .groups = "drop")

plot_exits_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = exit, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_exits, aes(x = archive_version_year, y = median_exits),
            color = "purple", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Business Exits",
       title = "Business Exits by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
# plot_exits_interactive_plotly <- ggplotly(plot_exits_interactive)

plot_exits_static <- ggplot(medians_exits, aes(x = archive_version_year, y = median_exits)) +
  geom_line(color = "purple", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Business Exits",
       title = "Median Business Exits by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

# interactive with all + median:
# plot_births_interactive_plotly
# plot_deaths_interactive_plotly
# plot_entries_interactive_plotly
# plot_exits_interactive_plotly

# static with median only
plot_births_static

plot_deaths_static
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

plot_entries_static

plot_exits_static
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

# static with all + median
plot_births_interactive

plot_deaths_interactive
## Warning: Removed 100 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

plot_entries_interactive

plot_exits_interactive
## Warning: Removed 100 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

number of employees involved in ciritcal and non critical business’s business events by risk level over year (not really)

original

all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all_employees.csv")
all$avg_eal_tercile <- factor(all$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
all$pop_tercile <- factor(all$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

births <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_births_employees.csv") %>%
  filter(archive_version_year != 2012)
births$avg_eal_tercile <- factor(births$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
births$pop_tercile <- factor(births$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

entries <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_entries_employees.csv")%>%
  filter(archive_version_year != 2012)
entries$avg_eal_tercile <- factor(entries$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
entries$pop_tercile <- factor(entries$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

deaths <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_deaths_employees.csv")%>%
  filter(archive_version_year != 2022)
deaths$avg_eal_tercile <- factor(deaths$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
deaths$pop_tercile <- factor(deaths$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

exits <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_exits_employees.csv")%>%
  filter(archive_version_year != 2022)
exits$avg_eal_tercile <- factor(exits$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
exits$pop_tercile <- factor(exits$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))


plot1 <- all %>%
  ggplot()+
  geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by population tercile, all - employees")
# ggsave("figures/critical_by_pop_emp.png")

plot2 <- births %>%ggplot()+
  geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, births, births - employees")
# ggsave("figures/naics_by_tercile_births_emp.png")


plot3 <- deaths %>% ggplot()+
  geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, deaths - employees")
# ggsave("figures/naics_by_tercile_deaths_emp.png")

plot4 <- entries %>% ggplot()+
  geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, entries - employees")
# ggsave("figures/naics_by_tercile_entries_emp.png")

plot5 <- exits %>% ggplot()+
  geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, exits - employees")
# ggsave("figures/naics_by_tercile_exit_emps.png")

plot1

plot2

plot3

plot4

plot5

new

lineplot

all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all_employees.csv") %>%
  filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile))
## Rows: 220 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pop_tercile, avg_eal_tercile, critical
## dbl (2): archive_version_year, employees
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 2. Factor levels for risk and population terciles
all$avg_eal_tercile <- factor(all$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
all$pop_tercile <- factor(all$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

# 3. Recode "Not" to "Non-Critical" for the legend
all <- all %>%
  mutate(critical = recode(critical,
                           "Not" = "Non-Critical",
                           "Critical" = "Critical"))

# Create the plot
plot1 <- all %>%
    ggplot(aes(x = archive_version_year, y = employees, color = critical)) +
  # Thinner line
  geom_line(size = 0.6, alpha = 0.8) +
  # Facet by population tercile (rows) and risk level (columns)
  facet_grid(pop_tercile ~ avg_eal_tercile,
             labeller = labeller(avg_eal_tercile = c("High" = "High Risk",
                                                     "Moderate" = "Moderate Risk",
                                                     "Low" = "Low Risk"))) +
  labs(x = "Year",
       y = "Employee Size",
       title = "Employee Size by Population Tercile and Risk Level",
       color = "Critical Status") +
  theme_minimal() +
  theme(legend.position = "top")

plot1

stackedbar plot

all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all_employees.csv") %>%
  filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile))
## Rows: 220 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pop_tercile, avg_eal_tercile, critical
## dbl (2): archive_version_year, employees
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all <- all %>%
  mutate(avg_eal_tercile = factor(avg_eal_tercile, levels = c("High", "Moderate", "Low"),
                                  labels = c("High Risk", "Moderate Risk", "Low Risk")),
         pop_tercile = factor(pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low")))

# Create the stacked bar plot:
plot1_stk <- all %>%
  ggplot(aes(x = archive_version_year, y = employees, fill = critical)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_grid(pop_tercile ~ avg_eal_tercile) +
  labs(x = "Year",
       y = "Employee Size",
       title = "Employee Size by Population Tercile and Risk Level",
       fill = "Critical Status") +
  theme_minimal() +
  theme(legend.position = "top")

plot1_stk

Mean employee size by risk level and population level

original 

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
  mutate(CBSAFP=as.character(CBSAFP)) %>%
  filter(!is.na(avg_eal)) %>%
  left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))

plot1 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,mean_employees,color=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Mean Employees")
# ggsave("figures/number_of_employees.png")

ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

new

median mean employee size by population level

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
  mutate(CBSAFP=as.character(CBSAFP)) %>%
  filter(!is.na(avg_eal)) %>%
  left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
median_meanemployee <- data %>%
  filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile)) %>%
  group_by(archive_version_year, pop_tercile) %>%
  summarise(median_meanemployee = median(mean_employees, na.rm = TRUE), .groups = "drop")

plot1 <- ggplot(median_meanemployee, aes(x = archive_version_year, y = median_meanemployee)) +
  geom_line(color = "blue", size = 1.2) +
  facet_wrap(~pop_tercile, ncol = 3, labeller = labeller(pop_tercile = c("Pop: High" = "High Population", "Pop: Moderate" = "Moderate Population", "Pop: Low" = "Low Population"))) +
  labs(x = "Year", y = "Median Mean Employee Size",
       title = "Median Mean Employee Size by Population Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot1

median mean employee size by risk level

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
  mutate(CBSAFP=as.character(CBSAFP)) %>%
  filter(!is.na(avg_eal)) %>%
  left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
median_meanemployee <- data %>%
  filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile)) %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_meanemployee = median(mean_employees, na.rm = TRUE), .groups = "drop")

plot2 <- ggplot(median_meanemployee, aes(x = archive_version_year, y = median_meanemployee)) +
  geom_line(color = "blue", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = labeller(avg_eal_tercile = c("High" = "High Risk",
                                                     "Moderate" = "Moderate Risk",
                                                     "Low" = "Low Risk"))) +
  labs(x = "Year", y = "Median Mean Employee Size",
       title = "Median Mean Employee Size by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot2

median mean employee size by risk and population level (all + median)

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>% 
  mutate(CBSAFP = as.character(CBSAFP)) %>% 
  filter(!is.na(avg_eal)) %>% 
  left_join(top100_cbsa, by = c("CBSAFP" = "GEOID")) %>%
  filter(!is.na(pop_tercile))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))

medians_employees <- data %>%
  group_by(archive_version_year, pop_tercile, avg_eal_tercile) %>%
  summarise(median_employees = median(mean_employees, na.rm = TRUE), .groups = "drop")

# Create the combined static plot:
# - Individual CBSA lines in thin gray
# - Overlaid median line in blue
# - Facet by population (rows) and risk (columns) with custom labels
plot1_static <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = mean_employees, group = NAME),
            color = "grey", size = 0.5, alpha = 0.5) +
  geom_line(data = medians_employees, aes(x = archive_version_year, y = median_employees),
            color = "blue", size = 1.2) +
  facet_grid(pop_tercile ~ avg_eal_tercile,
             labeller = labeller(
               pop_tercile = c("Pop: Low" = "Low Population",
                               "Pop: Moderate" = "Moderate Population",
                               "Pop: High" = "High Population"),
               avg_eal_tercile = c("High" = "High Risk",
                                   "Moderate" = "Moderate Risk",
                                   "Low" = "Low Risk")
             )) +
  labs(x = "Year", 
       y = "Mean Employees", 
       title = "Mean Employees by Population and Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

plot1_static

Number of Business Events by Population and Risk Level

original

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys_employees.csv")%>%
  mutate(CBSAFP=as.character(CBSAFP)) %>%
  filter(!is.na(avg_eal)) %>%
  left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))

plot1 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,births,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Births")
# ggsave("figures/births_emp.png")

plot2 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,deaths,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Deaths")
# ggsave("figures/deaths_emp.png")

plot3 <- data %>%
  ggplot()+
  geom_line(aes(archive_version_year,entry,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Entries")
# ggsave("figures/entries_emp.png")

plot4 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,exit,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Exits")
# ggsave("figures/exits_emp.png")

ggplotly(plot1) %>% layout(showlegend = FALSE)
ggplotly(plot2) %>% layout(showlegend = FALSE)
ggplotly(plot3) %>% layout(showlegend = FALSE)
ggplotly(plot4) %>% layout(showlegend = FALSE)

new

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys_employees.csv")%>%
  mutate(CBSAFP=as.character(CBSAFP)) %>%
  filter(!is.na(avg_eal)) %>%
  left_join(top100_cbsa,by=c("CBSAFP"="GEOID")) %>%
  filter(!is.na(pop_tercile))
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, 
                               levels = c("High", "Moderate", "Low"),
                               labels = c("High Risk", "Moderate Risk", "Low Risk"))
data$pop_tercile <- factor(data$pop_tercile, 
                           levels = c("Pop: Low", "Pop: Moderate", "Pop: High"),
                           labels = c("Low Population", "Moderate Population", "High Population"))


medians_births <- data %>%
  group_by(archive_version_year, pop_tercile, avg_eal_tercile) %>%
  summarise(median_births = median(births, na.rm = TRUE), .groups = "drop")

plot_births <- ggplot() +
  geom_line(data = data, 
            aes(x = archive_version_year, y = births, group = NAME),
            color = "grey", size = 0.5, alpha = 0.5) +
  geom_line(data = medians_births, 
            aes(x = archive_version_year, y = median_births),
            color = "blue", size = 1.2) +
  facet_grid(pop_tercile ~ avg_eal_tercile) +
  labs(x = "Year", 
       y = "Employee Size",
       title = "Employee Size Involved in Business Births by Population and Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

# Create interactive plotly version with hover text
# plot_births_interactive <- ggplotly(plot_births, tooltip = "text")

# Display the plots (in your R session, you can view them)
# plot_births_interactive
plot_births

Chord Diagrams

nri_cbsa_all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/nri_cbsa_all.csv") %>%
  mutate(CBSAFP =as.character(CBSAFP))
## Rows: 940 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_all
## dbl (3): CBSAFP, avg_eal_all, tercile_all
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
regions <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/top100_regions.csv") %>%
  mutate(GEOID =as.character(GEOID)) %>%
  select(GEOID, Region)
## Rows: 100 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pop_tercile, NAME, Region
## dbl (1): GEOID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
top100_cbsa <- get_acs(geography = "cbsa", variables = c(pop="B01001_001"),output = "wide") %>%
  top_n(100,popE) %>%
  mutate(pop_tercile_num = ntile(popE, 3)) %>%
  mutate(pop_tercile=as_factor(pop_tercile_num)) %>%
  mutate(pop_tercile = factor(pop_tercile, levels = 1:3, labels = c("Pop: Low", "Pop: Moderate", "Pop: High"))) %>% 
  select(pop_tercile,pop_tercile_num,popE,GEOID) %>%
  left_join(regions)
## Getting data from the 2019-2023 5-year ACS
## Warning: • You have not set a Census API key. Users without a key are limited to 500
## queries per day and may experience performance limitations.
## ℹ For best results, get a Census API key at
## http://api.census.gov/data/key_signup.html and then supply the key to the
## `census_api_key()` function to use it throughout your tidycensus session.
## This warning is displayed once per session.
## Joining with `by = join_by(GEOID)`
top100_cbsa_new <- top100_cbsa %>%
  rename_with(~paste0(., "_new"), everything())
top100_cbsa_old <- top100_cbsa %>%
  rename_with(~paste0(., "_old"), everything())

moves <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/migration_map.csv") %>%
  select(CBSAFP_old,CBSAFP_new,NAME_old,NAME_new,row_count,avg_eal_tercile_old,avg_eal_tercile_new) %>%
  mutate(CBSAFP_old=as.character(CBSAFP_old)) %>%
  mutate(CBSAFP_new=as.character(CBSAFP_new)) %>%
  mutate(NAME_old=ifelse(is.na(avg_eal_tercile_old),"Outside",NAME_old)) %>%
  mutate(NAME_new=ifelse(is.na(avg_eal_tercile_new),"Outside",NAME_new)) %>%
  group_by(CBSAFP_old,CBSAFP_new,NAME_old,NAME_new,avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count)) %>%
  left_join(top100_cbsa_old, by=c("CBSAFP_old"="GEOID_old")) %>%
  left_join(top100_cbsa_new, by=c("CBSAFP_new"="GEOID_new")) %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new)) %>%
  mutate(Region_old=ifelse(is.na(Region_old),"Outside",Region_old)) %>%
  mutate(Region_new=ifelse(is.na(Region_new),"Outside",Region_new))
## Rows: 63970 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): fips_code, lead_county, avg_eal_tercile_old, NAME_old, avg_eal_ter...
## dbl (12): archive_version_year, row_count, CBSAFP_old, CBSAFP_new, avg_eal_o...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'CBSAFP_old', 'CBSAFP_new', 'NAME_old', 'NAME_new', 'avg_eal_tercile_old'. You can override using the `.groups` argument.

Moving between risk level

Chord Plot

data <- moves %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption="Risk level to Risk level")
Risk level to Risk level
avg_eal_tercile_old High Low Moderate Outside
High 0.1418442 0.0145098 0.0363402 0.0729597
Low 0.0149617 0.0161366 0.0323389 0.0577516
Moderate 0.0303752 0.0298987 0.0341958 0.0614735
Outside 0.0696075 0.0633796 0.0675453 0.2566818
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Risk level to Risk level", cex.main = 1)

# png(file="figures/risk_to_risk.png")
dev.off()
## null device 
##           1
data <- moves %>%
  filter(Region_old==Region_new) %>%
  filter(Region_old!="Outside") %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Moves within Region")
Moves within Region
avg_eal_tercile_old High Low Moderate
High 0.5021631 0.0183028 0.0656683
Low 0.0214088 0.0354964 0.0905158
Moderate 0.0593086 0.0872989 0.1198373
chordDiagram(
  x=data, directional = 1, direction.type = c("diffHeight", "arrows"),link.arr.type = "big.arrow",
    grid.col = c(High="red",Moderate="yellow", Low="green"))
title("Moves within Region", cex.main = 1)
# png(file="figures/moves_within_region.png")
dev.off()
## null device 
##           1
data <- moves %>%
  filter(Region_old!=Region_new) %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
table <- moves %>%
  filter(Region_old!=Region_new) %>%
  filter(avg_eal_tercile_new=="Outside") %>%
  left_join(nri_cbsa_all,by=c("CBSAFP_new"="CBSAFP")) %>%
  group_by(avg_eal_tercile_all) %>%
  summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
  kable(caption="Risk of Destinations outside top100")
table
Risk of Destinations outside top100
avg_eal_tercile_all row_count
High 13209
Low 3834
Moderate 5771
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Moves outside Region")
Moves outside Region
avg_eal_tercile_old High Low Moderate Outside
High 0.0591552 0.0204144 0.0424992 0.1398169
Low 0.0199004 0.0161259 0.0238998 0.1092194
Moderate 0.0336171 0.0204947 0.0147928 0.1173948
Outside 0.1335528 0.1191776 0.1299390 NA
chordDiagram(
  x=data, directional = 1, direction.type = c("diffHeight", "arrows"),link.arr.type = "big.arrow",
    grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Moves outside Region", cex.main = 1)
# png(file="figures/moves_outside_region.png")
dev.off()
## null device 
##           1

Table

moves_table <- moves %>%
  group_by(avg_eal_tercile_old, avg_eal_tercile_new) %>%
  summarise(row_count = sum(row_count, na.rm = TRUE), .groups = "drop") %>%
  # Replace NA with "Outside" for both origin and destination
  mutate(
    avg_eal_tercile_old = ifelse(is.na(avg_eal_tercile_old), "Outside", avg_eal_tercile_old),
    avg_eal_tercile_new = ifelse(is.na(avg_eal_tercile_new), "Outside", avg_eal_tercile_new)
  )

total_moves <- sum(moves_table$row_count, na.rm = TRUE)

moves_table <- moves_table %>%
  mutate(pct = round(row_count / total_moves * 100, 1))

moves_wide <- moves_table %>%
  select(avg_eal_tercile_old, avg_eal_tercile_new, pct) %>%
  pivot_wider(names_from = avg_eal_tercile_new, values_from = pct, values_fill = 0)

moves_wide <- moves_wide %>%
  rename(`Origin Risk Level` = avg_eal_tercile_old)

desired_order <- c("High", "Moderate", "Low", "Outside")

moves_wide$`Origin Risk Level` <- factor(moves_wide$`Origin Risk Level`,
                                         levels = desired_order)
moves_wide <- moves_wide %>%
  select(`Origin Risk Level`, all_of(desired_order)) %>%
  arrange(`Origin Risk Level`)

moves_wide %>%
  kable(
    format = "html",
    digits = 1,
    caption = "Percentage of Business Moves by Risk Level (Origin → Destination) [%]"
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Destination Risk Level" = 4))
Percentage of Business Moves by Risk Level (Origin → Destination) [%]
Destination Risk Level
Origin Risk Level High Moderate Low Outside
High 14.2 3.6 1.5 7.3
Moderate 3.0 3.4 3.0 6.1
Low 1.5 3.2 1.6 5.8
Outside 7.0 6.8 6.3 25.7

This chunk produces the chord diagrams for critical services and non-critical services

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_crit.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 16 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Critical: Risk level to Risk level")
Critical: Risk level to Risk level
avg_eal_tercile_old Low Moderate High Outside
Low 0.0160731 0.0317129 0.0136222 0.0600542
Moderate 0.0289641 0.0325660 0.0281381 0.0618009
High 0.0120379 0.0328097 0.1314421 0.0702234
Outside 0.0668382 0.0712932 0.0700339 0.2723900
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Critical: Risk level to Risk level", cex.main = 1)

# png(file="figures/critical_risk_to_risk.png")
dev.off()
## null device 
##           1
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_non_crit.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 16 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Non-Critical: Risk level to Risk level")
Non-Critical: Risk level to Risk level
avg_eal_tercile_old Low Moderate High Outside
Low 0.0162345 0.0333048 0.0170285 0.0541986
Moderate 0.0313408 0.0367105 0.0338271 0.0609682
High 0.0183239 0.0417877 0.1578947 0.0771818
Outside 0.0580431 0.0617622 0.0689497 0.2324440
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Non-Critical: Risk level to Risk level", cex.main = 1)

# png(file="figures/non_critical_risk_to_risk.png")
dev.off()
## null device 
##           1

This chunk produces the chord diagrams for standalone and chains

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_chain.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 16 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Chain: Risk level to Risk level")
Chain: Risk level to Risk level
avg_eal_tercile_old Low Moderate High Outside
Low 0.0239119 0.0384862 0.0292970 0.0452547
Moderate 0.0373499 0.0460452 0.0461934 0.0478237
High 0.0328541 0.0570130 0.1558223 0.0509856
Outside 0.0636826 0.0644731 0.0606689 0.2001383
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Chain: Risk level to Risk level", cex.main = 1)

# png(file="figures/chain_risk_to_risk.png")
dev.off()
## null device 
##           1
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_standalone.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 16 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Standalone: Risk level to Risk level")
Standalone: Risk level to Risk level
avg_eal_tercile_old Low Moderate High Outside
Low 0.0145856 0.0311126 0.0121021 0.0602444
Moderate 0.0284123 0.0318321 0.0272199 0.0641963
High 0.0108505 0.0322164 0.1390559 0.0773431
Outside 0.0633192 0.0681581 0.0713906 0.2679610
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Standalone: Risk level to Risk level", cex.main = 1)
# png(file="figures/standalone_risk_to_risk.png")
dev.off()
## null device 
##           1

This chunk produces across population categories. Outside the top100 is excluded.

#within same population category
data <- moves %>%
  filter(pop_tercile_num_old==pop_tercile_num_new) %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Same pop category: Risk level to Risk level")
Same pop category: Risk level to Risk level
avg_eal_tercile_old High Low Moderate Outside
High 0.4685364 0.0425758 0.1014529 NA
Low 0.0436896 0.0349314 0.0807472 0.0006075
Moderate 0.0831266 0.0680909 0.0714322 0.0016706
Outside NA 0.0004556 0.0026831 NA
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Same pop category: Risk level to Risk level", cex.main = 1)

# png(file="figures/samepop_risk_to_risk.png")

#lower pop to higher pop
data <- moves %>%
  filter(!is.na(pop_tercile_old),pop_tercile_num_old>pop_tercile_num_new) %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Towards higher pop: Risk level to Risk level")
Towards higher pop: Risk level to Risk level
avg_eal_tercile_old High Low Moderate Outside
High 0.3667207 0.0683614 0.1417865 0.0037472
Low 0.0196476 0.0562082 0.0668422 0.0085072
Moderate 0.0282560 0.0941868 0.1438120 0.0019242
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Towards higher pop: Risk level to Risk level", cex.main = 1)

# png(file="figures/low_to_high_risk_to_risk.png")

#higher pop to lower pop
data <- moves %>%
  filter(!is.na(pop_tercile_old),pop_tercile_num_old<pop_tercile_num_new) %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Toward lower pop: Risk level to Risk level")
Toward lower pop: Risk level to Risk level
avg_eal_tercile_old High Low Moderate
High 0.3675349 0.0159453 0.0325839
Low 0.0676439 0.0499158 0.1152818
Moderate 0.1219174 0.0842825 0.1305338
Outside 0.0036645 0.0092107 0.0014856
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Toward lower pop: Risk level to Risk level", cex.main = 1)

# png(file="figures/high_to_low_risk_to_risk.png")

This chunk produces across population categories. Inclusive of top100.

#within same population category
data <- moves %>%
  filter(pop_tercile_num_old==pop_tercile_num_new|avg_eal_tercile_old=="Outside"|avg_eal_tercile_new=="Outside") %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Same pop category: Risk level to Risk level")
Same pop category: Risk level to Risk level
avg_eal_tercile_old High Low Moderate Outside
High 0.0937832 0.0085221 0.0203070 0.0899833
Low 0.0087450 0.0069919 0.0161625 0.0712266
Moderate 0.0166388 0.0136292 0.0142980 0.0758170
Outside 0.0858489 0.0781679 0.0833055 0.3165729
table <- moves %>%
  filter(avg_eal_tercile_old=="Outside") %>%
  filter(avg_eal_tercile_new=="Outside") %>%
  left_join(nri_cbsa_all,by=c("CBSAFP_new"="CBSAFP")) %>%
  group_by(avg_eal_tercile_all) %>%
  summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
  kable(caption="Same pop category:Risk of Destinations outside top100")
table
Same pop category:Risk of Destinations outside top100
avg_eal_tercile_all row_count
High 13625
Low 8181
Moderate 9435
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Same pop category: Risk level to Risk level", cex.main = 1)

# png(file="figures/samepop_risk_to_risk.png")

#lower pop to higher pop
data <- moves %>%
  filter(!is.na(pop_tercile_old),pop_tercile_num_old>pop_tercile_num_new) %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Towards higher pop: Risk level to Risk level")
Towards higher pop: Risk level to Risk level
avg_eal_tercile_old High Low Moderate Outside
High 0.3667207 0.0683614 0.1417865 0.0037472
Low 0.0196476 0.0562082 0.0668422 0.0085072
Moderate 0.0282560 0.0941868 0.1438120 0.0019242
table <- moves %>%
  filter(!is.na(pop_tercile_old),pop_tercile_num_old>pop_tercile_num_new) %>%
  filter(avg_eal_tercile_new=="Outside") %>%
  left_join(nri_cbsa_all,by=c("CBSAFP_new"="CBSAFP")) %>%
  group_by(avg_eal_tercile_all) %>%
  summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
  kable(caption="Towards higher pop:Risk of Destinations outside top100")
table
Towards higher pop:Risk of Destinations outside top100
avg_eal_tercile_all row_count
Moderate 140
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Towards higher pop: Risk level to Risk level", cex.main = 1)

# png(file="figures/low_to_high_risk_to_risk.png")

#higher pop to lower pop
data <- moves %>%
  filter(!is.na(pop_tercile_old),pop_tercile_num_old<pop_tercile_num_new) %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Toward lower pop: Risk level to Risk level")
Toward lower pop: Risk level to Risk level
avg_eal_tercile_old High Low Moderate
High 0.3675349 0.0159453 0.0325839
Low 0.0676439 0.0499158 0.1152818
Moderate 0.1219174 0.0842825 0.1305338
Outside 0.0036645 0.0092107 0.0014856
table <- moves %>%
  filter(!is.na(pop_tercile_old),pop_tercile_num_old<pop_tercile_num_new) %>%
  filter(avg_eal_tercile_new=="Outside") %>%
  left_join(nri_cbsa_all,by=c("CBSAFP_new"="CBSAFP")) %>%
  group_by(avg_eal_tercile_all) %>%
  summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
  kable(caption="Towards lower pop:Risk of Destinations outside top100")
table
Towards lower pop:Risk of Destinations outside top100
avg_eal_tercile_all row_count
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Toward lower pop: Risk level to Risk level", cex.main = 1)

# png(file="figures/high_to_low_risk_to_risk.png")

This chunk produces for same population moves, crit/non, chain/non-chain

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_crit_same_pop.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Critical Same Population Moves: Risk level to Risk level")
Critical Same Population Moves: Risk level to Risk level
avg_eal_tercile_old High Low Moderate
High 0.4626455 0.0411587 0.0918508
Low 0.0439087 0.0399670 0.0880924
Moderate 0.0735173 0.0747090 0.0841507
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Critical Same Population Moves: Risk level to Risk level", cex.main = 1)

# png(file="figures/chain_same_pop_risk_to_risk.png")
dev.off()
## null device 
##           1
#noncrit
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_non_crit_same_pop.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Non-Critical Same Population Moves: Risk level to Risk level")
Non-Critical Same Population Moves: Risk level to Risk level
avg_eal_tercile_old High Low Moderate
High 0.4721569 0.0495238 0.1001681
Low 0.0476190 0.0302521 0.0770868
Moderate 0.0834734 0.0676751 0.0720448
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Non-Critical Same Population Moves: Risk level to Risk level", cex.main = 1)
# png(file="figures/chain_same_pop_risk_to_risk.png")
dev.off()
## null device 
##           1
#chain

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_chain_same_pop.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Chain Same Population Moves: Risk level to Risk level")
Chain Same Population Moves: Risk level to Risk level
avg_eal_tercile_old High Low Moderate
High 0.3718961 0.0741097 0.1376323
Low 0.0666025 0.0381136 0.0694899
Moderate 0.1054860 0.0625602 0.0741097
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Chain Same Population Moves: Risk level to Risk level", cex.main = 1)
# png(file="figures/chain_same_pop_risk_to_risk.png")
dev.off()
## null device 
##           1
#standalone
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_standalone_same_pop.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Standalone Same Population Moves: Risk level to Risk level")
Standalone Same Population Moves: Risk level to Risk level
avg_eal_tercile_old High Low Moderate
High 0.5006490 0.0345652 0.0806749
Low 0.0381174 0.0347018 0.0879842
Moderate 0.0682424 0.0747319 0.0803334
chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Standalone Same Population Moves: Risk level to Risk level", cex.main = 1)
# png(file="figures/chain_same_pop_risk_to_risk.png")
dev.off()
## null device 
##           1

Bar Charts

This chunk produces bar graphs for all 100 CBSAs and corresponding entries/exits and births/deaths for all years of data

library(ggpattern)
## Warning: package 'ggpattern' was built under R version 4.4.1
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv")
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(data)
##  archive_version_year     CBSAFP          entry             exit       
##  Min.   :2013         Min.   :10420   Min.   :  1.00   Min.   :  1.00  
##  1st Qu.:2015         1st Qu.:19348   1st Qu.: 26.00   1st Qu.: 25.00  
##  Median :2018         Median :32060   Median : 45.00   Median : 44.00  
##  Mean   :2018         Mean   :30209   Mean   : 67.08   Mean   : 67.08  
##  3rd Qu.:2020         3rd Qu.:40080   3rd Qu.: 84.00   3rd Qu.: 79.00  
##  Max.   :2022         Max.   :49340   Max.   :561.00   Max.   :670.00  
##                                                        NA's   :100     
##      births           deaths           NAME              avg_eal     
##  Min.   :  1328   Min.   :  1128   Length:1000        Min.   :38.21  
##  1st Qu.:  3617   1st Qu.:  3416   Class :character   1st Qu.:65.60  
##  Median :  6166   Median :  5294   Mode  :character   Median :75.28  
##  Mean   : 12411   Mean   : 10836                      Mean   :77.28  
##  3rd Qu.: 13648   3rd Qu.: 11576                      3rd Qu.:94.11  
##  Max.   :171670   Max.   :139494                      Max.   :99.91  
##                   NA's   :100                                        
##     tercile     avg_eal_tercile   
##  Min.   :1.00   Length:1000       
##  1st Qu.:1.00   Class :character  
##  Median :2.00   Mode  :character  
##  Mean   :1.99                     
##  3rd Qu.:3.00                     
##  Max.   :3.00                     
## 
data <- data %>% gather(type,row_count, c(entry,exit,births,deaths)) %>%
  group_by(NAME,avg_eal_tercile,tercile,type) %>%
  summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
  mutate(label= paste0(type,"-",avg_eal_tercile))
## `summarise()` has grouped output by 'NAME', 'avg_eal_tercile', 'tercile'. You
## can override using the `.groups` argument.
color_ramp <- c("births-Low" = "#00b100", "births-Moderate" = "#007e00", "births-High" = "#004b00",
          "deaths-Low" = "#c600c6", "deaths-Moderate" = "#9f009f", "deaths-High" = "#780078")
data %>%
  filter(type %in% c("births", "deaths")) %>%
  group_by(NAME) %>%
  mutate(sort = sum(row_count,na.rm = T)) %>%
  ungroup() %>%
  ggplot(aes(x = fct_reorder(NAME, sort), y = row_count, fill = label)) + 
  geom_bar(position = "dodge", stat = "identity")+
  coord_flip()+
  theme_minimal()+
  scale_fill_manual(values=color_ramp)+
  ggtitle("Births and Deaths - 2013-2022")+
  theme(legend.position = "bottom")

# ggsave("figures/bar_chart_births_deaths.png")

color_ramp <- c("entry-Low" = "#00b100", "entry-Moderate" = "#007e00", "entry-High" = "#004b00",
          "exit-Low" = "#c600c6", "exit-Moderate" = "#9f009f", "exit-High" = "#780078")
data %>% filter(type %in% c("entry","exit")) %>%
  group_by(NAME) %>%
  mutate(sort = sum(row_count,na.rm = T)) %>%
  ungroup() %>%
  ggplot(aes(fill = label, y = row_count, x = forcats::fct_reorder(NAME, sort))) + 
  geom_bar(position = "dodge", stat = "identity")+
  coord_flip()+
  theme_minimal()+
  scale_fill_manual(values=color_ramp)+
  ggtitle("Entries and Exits - 2013-2022")+
  theme(legend.position = "bottom")

# ggsave("figures/bar_chart_entries_exits.png")
library(ggrepel)  
# Load and preprocess data (using your original code)
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv") %>%
  gather(type, row_count, c(entry, exit, births, deaths)) %>%
  group_by(NAME, avg_eal_tercile, tercile, type) %>%
  summarise(row_count = sum(row_count, na.rm = TRUE)) %>%
  mutate(label = paste0(type, "-", avg_eal_tercile))
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'NAME', 'avg_eal_tercile', 'tercile'. You can override using the `.groups` argument.
# Pivot to get births/deaths terciles per CBSA
plot_data <- data %>%
  filter(type %in% c("births", "deaths")) %>%
  select(NAME, type, tercile) %>%
  pivot_wider(names_from = type, values_from = tercile) %>%
  mutate(
    # Convert terciles to numeric positions (Moderate = 0)
    x = case_when(
      births == 1 ~ -1,
      births == 2 ~ 0,
      births == 3 ~ 1
    ),
    y = case_when(
      deaths == 1 ~ -1,
      deaths == 2 ~ 0,
      deaths == 3 ~ 1
    )
  )
## Adding missing grouping variables: `avg_eal_tercile`
# Plot
ggplot(plot_data, aes(x = x, y = y)) +
  # Reference lines (center at Moderate)
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray70") +
  # Points (smaller size = 2)
  geom_point(aes(color = avg_eal_tercile), size = 2, alpha = 0.8) +
  # Non-overlapping labels
  geom_text_repel(
    aes(label = NAME),
    size = 2.5,
    max.overlaps = 20,
    segment.size = 0.2
  ) +
  # Axis labels (discrete scale)
  scale_x_continuous(
    breaks = c(-1, 0, 1),
    labels = c("Low Births", "Moderate Births", "High Births")
  ) +
  scale_y_continuous(
    breaks = c(-1, 0, 1),
    labels = c("Low Deaths", "Moderate Deaths", "High Deaths")
  ) +
  # Color scheme (EAL terciles)
  scale_color_manual(
    values = c("Low" = "#00b100", "Moderate" = "#007e00", "High" = "#004b00")
  ) +
  labs(
    title = "Births vs. Deaths by CBSA (Tercile Comparison)",
    x = "",
    y = "",
    color = "EAL Tercile"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    panel.grid.major = element_blank()  # Remove gridlines for clarity
  )
## Warning: ggrepel: 100 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Expansion Test

expansion_size <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_rawdata.csv") %>%
  mutate(growth_count=employee_size_location-lag_employees,
         growth_pct=growth_count/lag_employees)
## Rows: 100600 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): company, avg_eal_tercile, avg_eal_tercile_parent, parent_region
## dbl (13): archive_version_year, parent_number, fips_code, primary_naics_code...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(expansion_size$growth_count)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1.00     1.00     4.00    23.62    11.00 64975.00
expansion_size %>%
  filter(growth_count <= quantile(growth_count, 0.95)) %>%
  ggplot()+geom_histogram(aes(growth_count))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(expansion_size$growth_pct)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     0.001     0.333     0.667     3.626     2.000 10999.000
expansion_size %>%
  filter(growth_pct <= quantile(growth_pct, 0.95)) %>%
  ggplot()+geom_histogram(aes(growth_pct))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

process_data <- function(file_path, caption, include_yearly_column = FALSE) {
  table <- read_csv(file_path) %>%
    mutate(
      avg_eal_tercile_parent = ifelse(is.na(avg_eal_tercile_parent), "Outside100", avg_eal_tercile_parent),
      Total = Low + Moderate + High,
      Low_pct = round(Low / Total, 2),
      Moderate_pct = round(Moderate / Total, 2),
      High_pct = round(High / Total, 2),
    )
  
  if (include_yearly_column) {
    table <- table %>%
      rename(year=archive_version_year) %>%
      select(year,avg_eal_tercile_parent, Low, Low_pct, Moderate, Moderate_pct, High, High_pct, Total)
  } else {
      table <- table %>%
      select(avg_eal_tercile_parent, Low, Low_pct, Moderate, Moderate_pct, High, High_pct, Total)
  }
  
  kable(table, caption = caption)
}



# All expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_all.csv", "ALL Expansions, rows (hq), columns (branch)")
## Rows: 4 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (4): Low, Moderate, High, <NA>
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
ALL Expansions, rows (hq), columns (branch)
avg_eal_tercile_parent Low Low_pct Moderate Moderate_pct High High_pct Total
High 3343 0.22 4861 0.32 6970 0.46 15174
Low 5331 0.40 4463 0.34 3412 0.26 13206
Moderate 2857 0.24 5810 0.50 3049 0.26 11716
Outside100 4421 0.31 5441 0.38 4497 0.31 14359
# Yearly expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_yearly.csv", "A<< Expansions yearly, rows (hq), columns (branch)", TRUE)
## Rows: 40 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (5): archive_version_year, Low, Moderate, High, <NA>
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
A<< Expansions yearly, rows (hq), columns (branch)
year avg_eal_tercile_parent Low Low_pct Moderate Moderate_pct High High_pct Total
2013 High 678 0.22 1057 0.34 1356 0.44 3091
2013 Low 673 0.36 630 0.34 558 0.30 1861
2013 Moderate 570 0.26 986 0.45 657 0.30 2213
2013 Outside100 952 0.31 1114 0.36 1028 0.33 3094
2014 High 202 0.20 314 0.31 498 0.49 1014
2014 Low 237 0.43 177 0.32 138 0.25 552
2014 Moderate 205 0.23 453 0.51 237 0.26 895
2014 Outside100 272 0.31 325 0.37 273 0.31 870
2015 High 201 0.25 244 0.31 351 0.44 796
2015 Low 355 0.41 297 0.34 214 0.25 866
2015 Moderate 173 0.23 398 0.54 169 0.23 740
2015 Outside100 212 0.28 301 0.40 245 0.32 758
2016 High 1014 0.21 1502 0.31 2282 0.48 4798
2016 Low 1289 0.43 929 0.31 775 0.26 2993
2016 Moderate 849 0.24 1763 0.50 929 0.26 3541
2016 Outside100 1221 0.29 1622 0.38 1408 0.33 4251
2017 High 230 0.23 321 0.32 449 0.45 1000
2017 Low 313 0.46 217 0.32 154 0.23 684
2017 Moderate 172 0.23 390 0.53 176 0.24 738
2017 Outside100 309 0.32 359 0.38 288 0.30 956
2018 High 163 0.21 249 0.32 366 0.47 778
2018 Low 250 0.44 183 0.32 135 0.24 568
2018 Moderate 180 0.25 362 0.50 180 0.25 722
2018 Outside100 408 0.33 490 0.40 341 0.28 1239
2019 High 119 0.25 148 0.31 210 0.44 477
2019 Low 131 0.43 101 0.33 71 0.23 303
2019 Moderate 97 0.27 172 0.47 96 0.26 365
2019 Outside100 124 0.32 156 0.40 112 0.29 392
2020 High 107 0.21 173 0.33 238 0.46 518
2020 Low 155 0.40 126 0.33 104 0.27 385
2020 Moderate 79 0.25 158 0.50 80 0.25 317
2020 Outside100 120 0.32 153 0.40 106 0.28 379
2021 High 121 0.25 152 0.31 217 0.44 490
2021 Low 210 0.40 149 0.28 170 0.32 529
2021 Moderate 83 0.28 158 0.54 54 0.18 295
2021 Outside100 114 0.28 148 0.36 145 0.36 407
2022 High 508 0.23 701 0.32 1003 0.45 2212
2022 Low 1718 0.38 1654 0.37 1093 0.24 4465
2022 Moderate 449 0.24 970 0.51 471 0.25 1890
2022 Outside100 689 0.34 773 0.38 551 0.27 2013
# All expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_all_10.csv", "Expansions, rows (hq), columns (branch)")
## Rows: 4 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (4): Low, Moderate, High, <NA>
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Expansions, rows (hq), columns (branch)
avg_eal_tercile_parent Low Low_pct Moderate Moderate_pct High High_pct Total
High 979 0.21 1424 0.31 2227 0.48 4630
Low 1383 0.47 868 0.29 713 0.24 2964
Moderate 743 0.23 1718 0.53 783 0.24 3244
Outside100 1252 0.31 1530 0.38 1253 0.31 4035
# Yearly expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_yearly_10.csv", "Expansions yearly, rows (hq), columns (branch)", TRUE)
## Rows: 40 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (5): archive_version_year, Low, Moderate, High, <NA>
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Expansions yearly, rows (hq), columns (branch)
year avg_eal_tercile_parent Low Low_pct Moderate Moderate_pct High High_pct Total
2013 High 225 0.22 346 0.33 466 0.45 1037
2013 Low 211 0.39 159 0.29 169 0.31 539
2013 Moderate 150 0.24 287 0.46 189 0.30 626
2013 Outside100 323 0.30 374 0.35 384 0.36 1081
2014 High 49 0.19 77 0.30 134 0.52 260
2014 Low 71 0.44 55 0.34 37 0.23 163
2014 Moderate 41 0.19 128 0.60 43 0.20 212
2014 Outside100 73 0.28 93 0.36 92 0.36 258
2015 High 47 0.20 68 0.30 115 0.50 230
2015 Low 70 0.42 50 0.30 48 0.29 168
2015 Moderate 49 0.25 96 0.49 51 0.26 196
2015 Outside100 58 0.26 88 0.39 77 0.35 223
2016 High 269 0.21 401 0.31 612 0.48 1282
2016 Low 303 0.47 190 0.30 151 0.23 644
2016 Moderate 214 0.24 461 0.52 219 0.24 894
2016 Outside100 255 0.30 334 0.40 251 0.30 840
2017 High 84 0.21 130 0.32 191 0.47 405
2017 Low 119 0.49 72 0.30 53 0.22 244
2017 Moderate 47 0.22 136 0.63 32 0.15 215
2017 Outside100 129 0.36 128 0.36 103 0.29 360
2018 High 60 0.19 93 0.30 161 0.51 314
2018 Low 104 0.50 60 0.29 45 0.22 209
2018 Moderate 56 0.21 140 0.53 67 0.25 263
2018 Outside100 110 0.34 133 0.41 82 0.25 325
2019 High 34 0.24 39 0.27 69 0.49 142
2019 Low 59 0.46 41 0.32 28 0.22 128
2019 Moderate 30 0.27 56 0.51 24 0.22 110
2019 Outside100 49 0.30 70 0.43 45 0.27 164
2020 High 24 0.16 40 0.27 83 0.56 147
2020 Low 49 0.49 35 0.35 16 0.16 100
2020 Moderate 21 0.23 55 0.60 16 0.17 92
2020 Outside100 36 0.29 60 0.48 28 0.23 124
2021 High 43 0.30 33 0.23 65 0.46 141
2021 Low 45 0.46 26 0.27 27 0.28 98
2021 Moderate 23 0.25 56 0.60 14 0.15 93
2021 Outside100 32 0.29 36 0.33 41 0.38 109
2022 High 144 0.21 197 0.29 331 0.49 672
2022 Low 352 0.52 180 0.27 139 0.21 671
2022 Moderate 112 0.21 303 0.56 128 0.24 543
2022 Outside100 187 0.34 214 0.39 150 0.27 551
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_all_20.csv", "Expansions, rows (hq), columns (branch)")
## Rows: 4 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (4): Low, Moderate, High, <NA>
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Expansions, rows (hq), columns (branch)
avg_eal_tercile_parent Low Low_pct Moderate Moderate_pct High High_pct Total
High 529 0.20 819 0.31 1337 0.50 2685
Low 886 0.49 522 0.29 409 0.23 1817
Moderate 474 0.23 1127 0.54 480 0.23 2081
Outside100 709 0.29 1006 0.41 723 0.30 2438
# Yearly expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_yearly_20.csv", "Expansions yearly, rows (hq), columns (branch)", TRUE)
## Rows: 40 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (5): archive_version_year, Low, Moderate, High, <NA>
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Expansions yearly, rows (hq), columns (branch)
year avg_eal_tercile_parent Low Low_pct Moderate Moderate_pct High High_pct Total
2013 High 121 0.20 200 0.33 281 0.47 602
2013 Low 115 0.41 78 0.28 90 0.32 283
2013 Moderate 74 0.22 159 0.48 99 0.30 332
2013 Outside100 152 0.26 242 0.41 194 0.33 588
2014 High 25 0.15 50 0.30 92 0.55 167
2014 Low 53 0.49 37 0.34 19 0.17 109
2014 Moderate 25 0.18 87 0.63 26 0.19 138
2014 Outside100 51 0.28 74 0.41 57 0.31 182
2015 High 20 0.14 44 0.31 76 0.54 140
2015 Low 47 0.44 36 0.34 24 0.22 107
2015 Moderate 26 0.22 60 0.51 32 0.27 118
2015 Outside100 33 0.23 62 0.44 47 0.33 142
2016 High 140 0.20 220 0.32 327 0.48 687
2016 Low 185 0.45 122 0.29 108 0.26 415
2016 Moderate 160 0.27 302 0.51 130 0.22 592
2016 Outside100 148 0.27 231 0.43 163 0.30 542
2017 High 53 0.20 83 0.32 127 0.48 263
2017 Low 86 0.52 51 0.31 27 0.16 164
2017 Moderate 28 0.20 93 0.65 22 0.15 143
2017 Outside100 71 0.34 77 0.37 59 0.29 207
2018 High 42 0.20 60 0.29 104 0.50 206
2018 Low 82 0.49 51 0.31 33 0.20 166
2018 Moderate 42 0.21 106 0.54 49 0.25 197
2018 Outside100 63 0.34 73 0.39 49 0.26 185
2019 High 23 0.27 24 0.28 39 0.45 86
2019 Low 45 0.49 29 0.32 18 0.20 92
2019 Moderate 19 0.28 34 0.50 15 0.22 68
2019 Outside100 37 0.33 47 0.42 27 0.24 111
2020 High 14 0.15 20 0.22 58 0.63 92
2020 Low 31 0.49 25 0.40 7 0.11 63
2020 Moderate 13 0.23 32 0.56 12 0.21 57
2020 Outside100 25 0.31 37 0.46 19 0.23 81
2021 High 17 0.24 14 0.20 40 0.56 71
2021 Low 26 0.57 7 0.15 13 0.28 46
2021 Moderate 17 0.27 36 0.56 11 0.17 64
2021 Outside100 18 0.33 17 0.31 20 0.36 55
2022 High 74 0.20 104 0.28 193 0.52 371
2022 Low 216 0.58 86 0.23 70 0.19 372
2022 Moderate 70 0.19 218 0.59 84 0.23 372
2022 Outside100 111 0.32 146 0.42 88 0.26 345