Descriptive States

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv")

## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

table1 <- data %>%
  mutate(outside_top_100=ifelse(is.na(avg_eal_tercile),"outside_top_100","within_top_100")) %>%
  group_by(archive_version_year,outside_top_100) %>%
  summarise(rows=sum(rows,na.rm = TRUE)) %>%
  pivot_wider(id_cols = archive_version_year,names_from = outside_top_100,values_from = rows) %>%
  mutate(outside_prop=outside_top_100/(outside_top_100+within_top_100),
         inside_prop=within_top_100/(outside_top_100+within_top_100))

## `summarise()` has grouped output by 'archive_version_year'. You can override
## using the `.groups` argument.

kable(table1,caption="Share of establishments in/out top 100 MSAs")

Share of establishments in/out top 100 MSAs
archive_version_year	outside_top_100	within_top_100	outside_prop	inside_prop
2013	5516520	10437838	0.3457688	0.6542312
2014	5573021	10343882	0.3501322	0.6498678
2015	5463901	10165015	0.3496020	0.6503980
2016	5423220	10293413	0.3450625	0.6549375
2017	5124766	9702386	0.3456339	0.6543661
2018	5050960	9648466	0.3436161	0.6563839
2019	5151613	10345141	0.3324317	0.6675683
2020	5139349	10385438	0.3310415	0.6689585
2021	5247277	10705948	0.3289164	0.6710836
2022	5472590	11401758	0.3243142	0.6756858

moves <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/migration_map.csv") %>%
  mutate(old_outside_top_100=ifelse(is.na(avg_eal_tercile_old),"outside_top_100","within_top_100")) %>%
  mutate(new_outside_top_100=ifelse(is.na(avg_eal_tercile_new),"outside_top_100","within_top_100")) %>%
  mutate(type=paste0(old_outside_top_100,"_to_",new_outside_top_100)) %>%
  group_by(archive_version_year,type) %>%
  summarise(moves=sum(row_count,na.rm = TRUE)) %>%
  pivot_wider(id_cols = archive_version_year,names_from = type,values_from = moves) %>%
  rowwise() %>%
  mutate(total = sum(c_across(-1), na.rm = TRUE))

## Rows: 63970 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): fips_code, lead_county, avg_eal_tercile_old, NAME_old, avg_eal_ter...
## dbl (12): archive_version_year, row_count, CBSAFP_old, CBSAFP_new, avg_eal_o...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'archive_version_year'. You can override using the `.groups` argument.

kable(moves,caption="Moves in.out top 100 MSAs")

Moves in.out top 100 MSAs
archive_version_year	outside_top_100_to_outside_top_100	outside_top_100_to_within_top_100	within_top_100_to_outside_top_100	within_top_100_to_within_top_100	total
2012	3317	2218	2054	3635	7907
2013	3422	2435	2252	3963	8650
2014	2876	1899	1868	3078	6845
2015	2415	1740	1549	2642	5931
2016	3372	2437	2190	3948	8575
2017	2904	2411	2248	3944	8603
2018	4045	3731	3342	6559	13632
2019	3412	2721	2812	4895	10428
2020	2525	2143	2195	4230	8568
2021	2953	2672	2881	5778	11331

moves_prop <- moves %>%
  mutate(prop_outside_top_100_to_outside_top_100=outside_top_100_to_outside_top_100/total,
         prop_outside_top_100_to_within_top_100=outside_top_100_to_within_top_100/total,
         prop_within_top_100_to_outside_top_100=within_top_100_to_outside_top_100/total,
         prop_within_top_100_to_within_top_100=within_top_100_to_within_top_100/total) %>%
    select(-c(2:5))

kable(moves,caption="Shares of moves in.out top 100 MSAs")

Shares of moves in.out top 100 MSAs
archive_version_year	outside_top_100_to_outside_top_100	outside_top_100_to_within_top_100	within_top_100_to_outside_top_100	within_top_100_to_within_top_100	total
2012	3317	2218	2054	3635	7907
2013	3422	2435	2252	3963	8650
2014	2876	1899	1868	3078	6845
2015	2415	1740	1549	2642	5931
2016	3372	2437	2190	3948	8575
2017	2904	2411	2248	3944	8603
2018	4045	3731	3342	6559	13632
2019	3412	2721	2812	4895	10428
2020	2525	2143	2195	4230	8568
2021	2953	2672	2881	5778	11331

Census

3 is highest risk, 2 is moderate, 1 is low

table <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/census.csv")

## Rows: 4 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): avg_eal_tercile
## dbl (18): asian, avghhsize, bacplus, black, child, density, hispanic, medgro...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

table2 <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/industry.csv")

## Rows: 4 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): avg_eal_tercile
## dbl (14): FIRE, agriculutre_forestry_etc, armed_forces, arts_ent_rec_etc, co...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

kable(table,caption="Census Variables by Risk Level")

Census Variables by Risk Level
avg_eal_tercile	asian	avghhsize	bacplus	black	child	density	hispanic	medgrossrents	medhhinc	medianhhval	other	owner_occupied	poverty	senior	total	unemp	vacant	white
1	0.0315712	2.492941	0.3567584	0.1567143	0.2055315	345.1562	0.1144124	1042.8824	68181.35	219141.2	0.0381176	0.6606767	0.1227416	0.1556827	1494376.3	0.0503188	0.0879593	0.6591846
2	0.0374453	2.589091	0.3574478	0.1343262	0.2111732	604.2491	0.1572416	1089.0303	71106.09	240212.1	0.0427232	0.6623240	0.1234514	0.1502595	2060857.9	0.0543446	0.0858594	0.6282638
3	0.0889935	2.749091	0.3387351	0.0946081	0.2056592	828.4572	0.2888781	1412.9091	75917.24	387818.2	0.0494946	0.6279769	0.1258272	0.1651189	3149811.3	0.0588635	0.1053852	0.4780258
Outside	0.0150771	2.527414	0.2405747	0.0833987	0.2033337	115.2304	0.1335524	854.9571	56550.76	176321.7	0.0440430	0.6892702	0.1554496	0.1803648	109954.8	0.0553571	0.1511438	0.7239288

kable(table2,caption="Census Industry by Risk Level")

Census Industry by Risk Level
avg_eal_tercile	FIRE	agriculutre_forestry_etc	armed_forces	arts_ent_rec_etc	construction	education_healthcare_etc	information	manufacturing	other	professional_science_managment_etc	public_admin	retail	transportation_warehouse_utilities	wholesale
1	0.0722513	0.0089891	0.0099165	0.0835972	0.0632696	0.2451436	0.0180693	0.0957320	0.0461523	0.1176257	0.0523191	0.1067635	0.0549619	0.0252088
2	0.0703715	0.0083459	0.0074881	0.0846070	0.0643641	0.2331948	0.0176900	0.1133960	0.0466104	0.1192654	0.0454758	0.1095589	0.0535857	0.0260463
3	0.0670700	0.0173628	0.0086841	0.1020863	0.0728896	0.2182436	0.0198586	0.0764031	0.0474106	0.1310278	0.0458772	0.1119322	0.0562385	0.0249155
Outside	0.0444278	0.0391243	0.0089660	0.0892926	0.0697175	0.2388585	0.0121106	0.1328069	0.0469522	0.0739920	0.0527425	0.1165147	0.0529105	0.0215839

T-tests 3 is highest risk, 2 is moderate, 1 is low

library(rstatix)

## 
## Attaching package: 'rstatix'

## The following object is masked from 'package:stats':
## 
##     filter

table <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/census_cbsas.csv")

## Rows: 939 Columns: 19

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): avg_eal_tercile
## dbl (18): asian, avghhsize, black, hispanic, medgrossrents, medhhinc, median...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

names <- names(table)[2:19]
table_tidy <- table %>% pivot_longer(!avg_eal_tercile)
for (i in names) {
  print(i)
  x <- table_tidy %>%
    filter(name==i) %>%
    mutate(avg_eal_tercile=as.character(avg_eal_tercile)) %>%
    select(-name)
  pairwise_result <- x %>% pairwise_t_test(value~avg_eal_tercile, p.adjust.method = "bonferroni") %>%
    mutate(variable=i) %>%
    select(variable, everything()) %>%
    select(-.y.)
  print(pairwise_result)
}

## [1] "asian"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 asian    1      2          34    33 3.9 e- 1 ns       1   e+ 0 ns          
## 2 asian    1      3          34    33 1.55e-16 ****     9.31e-16 ****        
## 3 asian    2      3          33    33 1.57e-13 ****     9.45e-13 ****        
## 4 asian    1      Outside    34   839 7.73e- 4 ***      4.64e- 3 **          
## 5 asian    2      Outside    33   839 7.32e- 6 ****     4.39e- 5 ****        
## 6 asian    3      Outside    33   839 3.22e-45 ****     1.93e-44 ****        
## [1] "avghhsize"
## # A tibble: 6 × 9
##   variable  group1 group2     n1    n2           p p.signif   p.adj p.adj.signif
##   <chr>     <chr>  <chr>   <int> <int>       <dbl> <chr>      <dbl> <chr>       
## 1 avghhsize 1      2          34    33     7.5 e-2 ns       4.5 e-1 ns          
## 2 avghhsize 1      3          34    33     2.37e-6 ****     1.42e-5 ****        
## 3 avghhsize 2      3          33    33     3.32e-3 **       1.99e-2 *           
## 4 avghhsize 1      Outside    34   839     3.72e-1 ns       1   e+0 ns          
## 5 avghhsize 2      Outside    33   839     1.16e-1 ns       6.94e-1 ns          
## 6 avghhsize 3      Outside    33   839     2.03e-8 ****     1.22e-7 ****        
## [1] "black"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2        p p.signif   p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>    <dbl> <chr>      <dbl> <chr>       
## 1 black    1      2          34    33 0.462    ns       1       ns          
## 2 black    1      3          34    33 0.0416   *        0.25    ns          
## 3 black    2      3          33    33 0.196    ns       1       ns          
## 4 black    1      Outside    34   839 0.000798 ***      0.00479 **          
## 5 black    2      Outside    33   839 0.0214   *        0.129   ns          
## 6 black    3      Outside    33   839 0.612    ns       1       ns          
## [1] "hispanic"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2         p p.signif     p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>     <dbl> <chr>        <dbl> <chr>       
## 1 hispanic 1      2          34    33 0.354     ns       1         ns          
## 2 hispanic 1      3          34    33 0.00017   ***      0.00102   **          
## 3 hispanic 2      3          33    33 0.00479   **       0.0287    *           
## 4 hispanic 1      Outside    34   839 0.563     ns       1         ns          
## 5 hispanic 2      Outside    33   839 0.48      ns       1         ns          
## 6 hispanic 3      Outside    33   839 0.0000042 ****     0.0000252 ****        
## [1] "medgrossrents"
## # A tibble: 6 × 9
##   variable     group1 group2    n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>        <chr>  <chr>  <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 medgrossren… 1      2         34    33 3.75e- 1 ns       1   e+ 0 ns          
## 2 medgrossren… 1      3         34    33 2.24e-12 ****     1.34e-11 ****        
## 3 medgrossren… 2      3         33    33 9.51e-10 ****     5.71e- 9 ****        
## 4 medgrossren… 1      Outsi…    34   839 5.40e- 7 ****     3.24e- 6 ****        
## 5 medgrossren… 2      Outsi…    33   839 8.64e-10 ****     5.18e- 9 ****        
## 6 medgrossren… 3      Outsi…    33   839 1.57e-44 ****     9.40e-44 ****        
## [1] "medhhinc"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 medhhinc 1      2          34    33 3.35e- 1 ns       1   e+ 0 ns          
## 2 medhhinc 1      3          34    33 1.09e- 2 *        6.54e- 2 ns          
## 3 medhhinc 2      3          33    33 1.16e- 1 ns       6.94e- 1 ns          
## 4 medhhinc 1      Outside    34   839 1.06e- 7 ****     6.39e- 7 ****        
## 5 medhhinc 2      Outside    33   839 6.5 e-11 ****     3.90e-10 ****        
## 6 medhhinc 3      Outside    33   839 6.88e-18 ****     4.13e-17 ****        
## [1] "medianhhval"
## # A tibble: 6 × 9
##   variable    group1 group2     n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>       <chr>  <chr>   <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 medianhhval 1      2          34    33 4.07e- 1 ns       1   e+ 0 ns          
## 2 medianhhval 1      3          34    33 5.51e-11 ****     3.31e-10 ****        
## 3 medianhhval 2      3          33    33 1.12e- 8 ****     6.74e- 8 ****        
## 4 medianhhval 1      Outside    34   839 1.89e- 2 *        1.13e- 1 ns          
## 5 medianhhval 2      Outside    33   839 5.64e- 4 ***      3.38e- 3 **          
## 6 medianhhval 3      Outside    33   839 1.61e-28 ****     9.68e-28 ****        
## [1] "owner_occupied"
## # A tibble: 6 × 9
##   variable       group1 group2    n1    n2       p p.signif   p.adj p.adj.signif
##   <chr>          <chr>  <chr>  <int> <int>   <dbl> <chr>      <dbl> <chr>       
## 1 owner_occupied 1      2         34    33 9.14e-1 ns       1   e+0 ns          
## 2 owner_occupied 1      3         34    33 3.16e-2 *        1.89e-1 ns          
## 3 owner_occupied 2      3         33    33 2.5 e-2 *        1.5 e-1 ns          
## 4 owner_occupied 1      Outsi…    34   839 8.68e-3 **       5.21e-2 ns          
## 5 owner_occupied 2      Outsi…    33   839 1.48e-2 *        8.85e-2 ns          
## 6 owner_occupied 3      Outsi…    33   839 3.58e-8 ****     2.15e-7 ****        
## [1] "poverty"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2       p p.signif  p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>   <dbl> <chr>     <dbl> <chr>       
## 1 poverty  1      2          34    33 0.963   ns       1      ns          
## 2 poverty  1      3          34    33 0.839   ns       1      ns          
## 3 poverty  2      3          33    33 0.877   ns       1      ns          
## 4 poverty  1      Outside    34   839 0.00275 **       0.0165 *           
## 5 poverty  2      Outside    33   839 0.00388 **       0.0233 *           
## 6 poverty  3      Outside    33   839 0.00749 **       0.0449 *           
## [1] "total"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 total    1      2          34    33 8.36e- 3 **       5.01e- 2 ns          
## 2 total    1      3          34    33 2.91e-14 ****     1.74e-13 ****        
## 3 total    2      3          33    33 5.50e- 7 ****     3.30e- 6 ****        
## 4 total    1      Outside    34   839 1.02e-18 ****     6.14e-18 ****        
## 5 total    2      Outside    33   839 2.02e-33 ****     1.21e-32 ****        
## 6 total    3      Outside    33   839 1.69e-71 ****     1.01e-70 ****        
## [1] "unemp"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2     p p.signif p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int> <dbl> <chr>    <dbl> <chr>       
## 1 unemp    1      2          34    33 0.444 ns       1     ns          
## 2 unemp    1      3          34    33 0.104 ns       0.626 ns          
## 3 unemp    2      3          33    33 0.394 ns       1     ns          
## 4 unemp    1      Outside    34   839 0.181 ns       1     ns          
## 5 unemp    2      Outside    33   839 0.791 ns       1     ns          
## 6 unemp    3      Outside    33   839 0.358 ns       1     ns          
## [1] "vacant"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2           p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>       <dbl> <chr>       <dbl> <chr>       
## 1 vacant   1      2          34    33 0.907       ns        1   e+0 ns          
## 2 vacant   1      3          34    33 0.333       ns        1   e+0 ns          
## 3 vacant   2      3          33    33 0.282       ns        1   e+0 ns          
## 4 vacant   1      Outside    34   839 0.00000109  ****      6.56e-6 ****        
## 5 vacant   2      Outside    33   839 0.000000693 ****      4.16e-6 ****        
## 6 vacant   3      Outside    33   839 0.000482    ***       2.89e-3 **          
## [1] "white"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 white    1      2          34    33 5.38e- 1 ns       1   e+ 0 ns          
## 2 white    1      3          34    33 3.22e- 4 ***      1.93e- 3 **          
## 3 white    2      3          33    33 3.04e- 3 **       1.82e- 2 *           
## 4 white    1      Outside    34   839 7.18e- 2 ns       4.31e- 1 ns          
## 5 white    2      Outside    33   839 8.8 e- 3 **       5.28e- 2 ns          
## 6 white    3      Outside    33   839 2.63e-11 ****     1.58e-10 ****        
## [1] "other"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2     p p.signif p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int> <dbl> <chr>    <dbl> <chr>       
## 1 other    1      2          34    33 0.721 ns           1 ns          
## 2 other    1      3          34    33 0.377 ns           1 ns          
## 3 other    2      3          33    33 0.602 ns           1 ns          
## 4 other    1      Outside    34   839 0.52  ns           1 ns          
## 5 other    2      Outside    33   839 0.888 ns           1 ns          
## 6 other    3      Outside    33   839 0.56  ns           1 ns          
## [1] "density"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 density  1      2          34    33 9.40e- 8 ****     5.64e- 7 ****        
## 2 density  1      3          34    33 1.4 e-22 ****     8.39e-22 ****        
## 3 density  2      3          33    33 4.34e- 6 ****     2.61e- 5 ****        
## 4 density  1      Outside    34   839 4.39e-11 ****     2.63e-10 ****        
## 5 density  2      Outside    33   839 1.74e-40 ****     1.05e-39 ****        
## 6 density  3      Outside    33   839 9.06e-77 ****     5.43e-76 ****        
## [1] "child"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2     p p.signif p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int> <dbl> <chr>    <dbl> <chr>       
## 1 child    1      2          34    33 0.433 ns       1     ns          
## 2 child    1      3          34    33 0.986 ns       1     ns          
## 3 child    2      3          33    33 0.447 ns       1     ns          
## 4 child    1      Outside    34   839 0.67  ns       1     ns          
## 5 child    2      Outside    33   839 0.134 ns       0.804 ns          
## 6 child    3      Outside    33   839 0.657 ns       1     ns          
## [1] "senior"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2         p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>     <dbl> <chr>       <dbl> <chr>       
## 1 senior   1      2          34    33 0.584     ns       1        ns          
## 2 senior   1      3          34    33 0.34      ns       1        ns          
## 3 senior   2      3          33    33 0.136     ns       0.817    ns          
## 4 senior   1      Outside    34   839 0.000513  ***      0.00308  **          
## 5 senior   2      Outside    33   839 0.0000303 ****     0.000182 ***         
## 6 senior   3      Outside    33   839 0.034     *        0.204    ns          
## [1] "bacplus"
## # A tibble: 6 × 9
##   variable group1 group2     n1    n2        p p.signif    p.adj p.adj.signif
##   <chr>    <chr>  <chr>   <int> <int>    <dbl> <chr>       <dbl> <chr>       
## 1 bacplus  1      2          34    33 9.74e- 1 ns       1   e+ 0 ns          
## 2 bacplus  1      3          34    33 3.89e- 1 ns       1   e+ 0 ns          
## 3 bacplus  2      3          33    33 3.74e- 1 ns       1   e+ 0 ns          
## 4 bacplus  1      Outside    34   839 2.09e-14 ****     1.25e-13 ****        
## 5 bacplus  2      Outside    33   839 3.39e-14 ****     2.03e-13 ****        
## 6 bacplus  3      Outside    33   839 1.58e-10 ****     9.5 e-10 ****

Line Graphs

Number of Business & Mean Establishment Year by risk level over year

Original

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
  filter(!is.na(avg_eal))

## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
plot1 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,rows,color=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Number of Businesses")
# ggsave("figures/number_of_businesses.png")
plot2 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,mean_year_established,color=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Mean year established")
# ggsave("figures/mean_year_established.png")

ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

ggplotly(plot2) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

New

library(ggplot2)
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
  filter(!is.na(avg_eal))

## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data$avg_eal_tercile <- factor(data$avg_eal_tercile,
                               levels = c("High", "Moderate", "Low"),
                               labels = c("High Risk", "Moderate Risk", "Low Risk"))

### Plot 1: Number of Businesses

medians_business <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_business = median(rows, na.rm = TRUE), .groups = "drop")

# interactive plot (all individual lines in light gray plus the median line)
plot1_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = rows, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_business, aes(x = archive_version_year, y = median_business),
            color = "blue", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3) +
  labs(x = "Year", y = "Number of Businesses",
       title = "Number of Businesses by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

plot1_interactive_plotly <- ggplotly(plot1_interactive)


# static plot with only the median line
plot1_static <- ggplot(medians_business, aes(x = archive_version_year, y = median_business)) +
  geom_line(data = medians_business, aes(x = archive_version_year, y = median_business),
            color = "blue", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3) +
  labs(x = "Year", y = "Median Number of Businesses",
       title = "Median Number of Businesses by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
# ggsave("figures/number_of_businesses_median.png", plot = plot1_static)

plot1_static_bar <- ggplot(medians_business, aes(x = archive_version_year, y = median_business)) +
  geom_bar(stat = "identity", fill = "blue", alpha = 0.7) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = labeller(avg_eal_tercile = c("High" = "High Risk", "Moderate" = "Moderate Risk", "Low" = "Low Risk"))) +
  labs(x = "Year", y = "Median Number of Businesses",
       title = "Median Number of Businesses by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### Plot 2: Mean Year Established

medians_year <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_year = median(mean_year_established, na.rm = TRUE), .groups = "drop")

# interactive plot (all individual lines in light gray plus the median line)
plot2_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = mean_year_established, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_year, aes(x = archive_version_year, y = median_year),
            color = "red", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3) +
  labs(x = "Year", y = "Mean Year Established",
       title = "Mean Year Established by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot2_interactive_plotly <- ggplotly(plot2_interactive)

# static plot with only the median line for Mean Year Established
plot2_static <- ggplot(medians_year, aes(x = archive_version_year, y = median_year)) +
  geom_line(color = "red", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3) +
  labs(x = "Year", y = "Median Mean Year Established",
       title = "Median Mean Year Established by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
# ggsave("figures/mean_year_established_median.png", plot = plot2_static)

plot1_interactive_plotly # interactive plot 1 will all + median

plot1_static # static plot 1 median only

plot1_interactive # static plot 1 with all + median

plot2_interactive_plotly # interactive plot 2 will all + median

plot2_static # static plot 2 with median only

plot2_interactive # static plot 2 with all + median

## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).

Business Events by risk level over year

Origianl

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv")

## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))

plot1 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,births,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Births")
# ggsave("figures/births.png")

plot2 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,deaths,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Deaths")
# ggsave("figures/deaths.png")

plot3 <- data %>%
  ggplot()+
  geom_line(aes(archive_version_year,entry,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Entries")
# ggsave("figures/entries.png")

plot4 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,exit,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =)+
  ggtitle("Exits")
# ggsave("figures/exits.png")

ggplotly(plot1)

ggplotly(plot2)

ggplotly(plot3)

ggplotly(plot4)

New

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv")

## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))

data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
risk_labeller <- labeller(avg_eal_tercile = c("High" = "High Risk",
                                                "Moderate" = "Moderate Risk",
                                                "Low" = "Low Risk"))

### 1. Births

medians_births <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_births = median(births, na.rm = TRUE), .groups = "drop")

# Interactive plot: all individual lines (light gray) with median line (blue)
plot_births_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = births, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_births, aes(x = archive_version_year, y = median_births),
            color = "blue", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Business Births",
       title = "Business Births by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot_births_interactive_plotly <- ggplotly(plot_births_interactive)

# Static plot: only the median line
plot_births_static <- ggplot(medians_births, aes(x = archive_version_year, y = median_births)) +
  geom_line(color = "blue", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Business Births",
       title = "Median Business Births by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### 2. Deaths

medians_deaths <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_deaths = median(deaths, na.rm = TRUE), .groups = "drop")

plot_deaths_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = deaths, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_deaths, aes(x = archive_version_year, y = median_deaths),
            color = "red", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Business Deaths",
       title = "Business Deaths by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot_deaths_interactive_plotly <- ggplotly(plot_deaths_interactive)

plot_deaths_static <- ggplot(medians_deaths, aes(x = archive_version_year, y = median_deaths)) +
  geom_line(color = "red", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Business Deaths",
       title = "Median Business Deaths by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### 3. Entries

medians_entries <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_entries = median(entry, na.rm = TRUE), .groups = "drop")

plot_entries_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = entry, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_entries, aes(x = archive_version_year, y = median_entries),
            color = "green", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Business Entries",
       title = "Business Entries by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot_entries_interactive_plotly <- ggplotly(plot_entries_interactive)

plot_entries_static <- ggplot(medians_entries, aes(x = archive_version_year, y = median_entries)) +
  geom_line(color = "green", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Business Entries",
       title = "Median Business Entries by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### 4. Exits

medians_exits <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_exits = median(exit, na.rm = TRUE), .groups = "drop")

plot_exits_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = exit, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_exits, aes(x = archive_version_year, y = median_exits),
            color = "purple", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Business Exits",
       title = "Business Exits by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot_exits_interactive_plotly <- ggplotly(plot_exits_interactive)

plot_exits_static <- ggplot(medians_exits, aes(x = archive_version_year, y = median_exits)) +
  geom_line(color = "purple", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Business Exits",
       title = "Median Business Exits by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

# interactive with all + median
plot_births_interactive_plotly

plot_deaths_interactive_plotly

plot_entries_interactive_plotly

plot_exits_interactive_plotly

# static with median
plot_births_static

plot_deaths_static

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

plot_entries_static

plot_exits_static

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

# static with all + median
plot_births_interactive

plot_deaths_interactive

## Warning: Removed 100 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

plot_entries_interactive

plot_exits_interactive

## Warning: Removed 100 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

number of critical and non critical business’s business events by risk level over year

original

all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all.csv")
all$avg_eal_tercile <- factor(all$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
all$pop_tercile <- factor(all$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

births <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_births.csv") %>%
  filter(archive_version_year != 2012)
births$avg_eal_tercile <- factor(births$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
births$pop_tercile <- factor(births$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

entries <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_entries.csv")%>%
  filter(archive_version_year != 2012)
entries$avg_eal_tercile <- factor(entries$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
entries$pop_tercile <- factor(entries$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

deaths <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_deaths.csv")%>%
  filter(archive_version_year != 2022)
deaths$avg_eal_tercile <- factor(deaths$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
deaths$pop_tercile <- factor(deaths$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

exits <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_exits.csv")%>%
  filter(archive_version_year != 2022)
exits$avg_eal_tercile <- factor(exits$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
exits$pop_tercile <- factor(exits$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))


plot1 <- all %>%
  ggplot()+
  geom_line(aes(archive_version_year,row_count,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by population tercile, all")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/critical_by_pop.png")

plot2 <- births %>%ggplot()+
  geom_line(aes(archive_version_year,births,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, births")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/naics_by_tercile_births.png")


plot3 <- deaths %>% ggplot()+
  geom_line(aes(archive_version_year,deaths,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, deaths")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/naics_by_tercile_deaths.png")

plot4 <- entries %>% ggplot()+
  geom_line(aes(archive_version_year,entries,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, entries")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/naics_by_tercile_entries.png")

plot5 <- exits %>% ggplot()+
  geom_line(aes(archive_version_year,exits,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, exits")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/naics_by_tercile_exits.png")

plot1

plot2

plot3

plot4

plot5

new (just overall for now, i will add each business events)

lineplot version

all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all.csv") %>%
  filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile))

## Rows: 220 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pop_tercile, avg_eal_tercile, critical
## dbl (2): archive_version_year, row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

all$avg_eal_tercile <- factor(all$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
all$pop_tercile <- factor(all$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

all <- all %>%
  mutate(critical = recode(critical,
                           "Not" = "Non-Critical",
                           "Critical" = "Critical"))

plot1 <- all %>%
  ggplot(aes(x = archive_version_year, y = row_count, color = critical)) +
  geom_line(size = 0.6, alpha = 0.8) +
  facet_grid(pop_tercile ~ avg_eal_tercile,
             labeller = labeller(avg_eal_tercile = c("High" = "High Risk",
                                                     "Moderate" = "Moderate Risk",
                                                     "Low" = "Low Risk"))) +
  labs(x = "Year",
       y = "Number of Business",
       title = "Business Counts by Population Tercile and Risk Level",
       color = "Critical Status") +
  theme_minimal() +
  theme(legend.position = "top")

plot1

stacked barplot version

all <- all %>%
  mutate(avg_eal_tercile = factor(avg_eal_tercile, levels = c("High", "Moderate", "Low"),
                                  labels = c("High Risk", "Moderate Risk", "Low Risk")),
         pop_tercile = factor(pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low")))

plot1_stk <- all %>%
  ggplot(aes(x = archive_version_year, y = row_count, fill = critical)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_grid(pop_tercile ~ avg_eal_tercile) +
  labs(x = "Year",
       y = "Number of Business",
       title = "Business Counts by Population Tercile and Risk Level",
       fill = "Critical Status") +
  theme_minimal() +
  theme(legend.position = "top")

plot1_stk

Number of Business & Mean Year Established by risk level and population level over year

original

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
  mutate(CBSAFP=as.character(CBSAFP)) %>%
  filter(!is.na(avg_eal)) %>%
  left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))

## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))

plot1 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,rows,color=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Number of Businesses")
# ggsave("figures/number_of_businesses.png")
plot2 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,mean_year_established,color=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Mean year established")
# ggsave("figures/mean_year_established.png")

ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

ggplotly(plot2) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

new

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>% 
  mutate(CBSAFP = as.character(CBSAFP)) %>% 
  filter(!is.na(avg_eal)) %>% 
  left_join(top100_cbsa, by = c("CBSAFP" = "GEOID")) %>%
  filter(!is.na(pop_tercile))

## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data$avg_eal_tercile <- factor(data$avg_eal_tercile, 
                               levels = c("High", "Moderate", "Low"),
                               labels = c("High Risk", "Moderate Risk", "Low Risk"))
data$pop_tercile <- factor(data$pop_tercile, 
                           levels = c("Pop: Low", "Pop: Moderate", "Pop: High"),
                           labels = c("Low Population", "Moderate Population", "High Population"))

### Plot 1: Number of Businesses

medians_business <- data %>%
  group_by(archive_version_year, pop_tercile, avg_eal_tercile) %>%
  summarise(median_rows = median(rows, na.rm = TRUE), .groups = "drop")

# Combined Plot: All individual CBSA lines (thin gray) + median line (blue)
plot1_combined <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = rows, group = NAME),
            color = "grey", size = 0.5, alpha = 0.5) +
  geom_line(data = medians_business, aes(x = archive_version_year, y = median_rows),
            color = "blue", size = 1.2) +
  facet_grid(pop_tercile ~ avg_eal_tercile) +
  labs(x = "Year", y = "Number of Businesses", 
       title = "Number of Businesses by Population and Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

# Static Plot: Only the median line for Number of Businesses
plot1_median <- ggplot(medians_business, aes(x = archive_version_year, y = median_rows)) +
  geom_line(color = "blue", size = 1.2) +
  facet_grid(pop_tercile ~ avg_eal_tercile) +
  labs(x = "Year", y = "Median Number of Businesses",
       title = "Median Number of Businesses by Population and Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### Plot 2: Mean Year Established

medians_year <- data %>%
  group_by(archive_version_year, pop_tercile, avg_eal_tercile) %>%
  summarise(median_year = median(mean_year_established, na.rm = TRUE), .groups = "drop")

# Combined Plot: All individual CBSA lines (thin gray) + median line (red)
plot2_combined <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = mean_year_established, group = NAME),
            color = "grey", size = 0.5, alpha = 0.5) +
  geom_line(data = medians_year, aes(x = archive_version_year, y = median_year),
            color = "red", size = 1.2) +
  facet_grid(pop_tercile ~ avg_eal_tercile) +
  labs(x = "Year", y = "Mean Year Established", 
       title = "Mean Year Established by Population and Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

# Static Plot: Only the median line for Mean Year Established
plot2_median <- ggplot(medians_year, aes(x = archive_version_year, y = median_year)) +
  geom_line(color = "red", size = 1.2) +
  facet_grid(pop_tercile ~ avg_eal_tercile) +
  labs(x = "Year", y = "Median Mean Year Established",
       title = "Median Mean Year Established by Population and Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

# combined plots
plot1_combined

plot2_combined

## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).

# static median-only plots
plot1_median

plot2_median

# interactive version
# interactive_plot1 <- ggplotly(plot1_combined)
# interactive_plot2 <- ggplotly(plot2_combined)

Number of Business Involved in Each Business Events

Old

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv")%>%
  mutate(CBSAFP=as.character(CBSAFP)) %>%
  filter(!is.na(avg_eal)) %>%
  left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))

## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))

plot1 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,births,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Births")
# ggsave("figures/births.png")

plot2 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,deaths,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Deaths")
# ggsave("figures/deaths.png")

plot3 <- data %>%
  ggplot()+
  geom_line(aes(archive_version_year,entry,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Entries")
# ggsave("figures/entries.png")

plot4 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,exit,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Exits")
# ggsave("figures/exits.png")

ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

ggplotly(plot2) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

ggplotly(plot3) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

ggplotly(plot4) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

new

(will work on this)

Line Employees

(Need to understand what this is doing)

I dont know if this is business count because these look different from the business counts by risk level by population level from before

original

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys_employees.csv")

## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))

plot1 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,births,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Births")
# ggsave("figures/births_emp.png")

plot2 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,deaths,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Deaths")
# ggsave("figures/deaths_emp.png")

plot3 <- data %>%
  ggplot()+
  geom_line(aes(archive_version_year,entry,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =3)+
  ggtitle("Entries - # of employees")
# ggsave("figures/entries_emp.png")

plot4 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,exit,color=NAME,group=NAME),alpha=0.5)+
  facet_wrap(~avg_eal_tercile,ncol =)+
  ggtitle("Exits")
# ggsave("figures/exits_emp.png")

ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

ggplotly(plot2) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

ggplotly(plot3) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

ggplotly(plot4) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

new

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys_employees.csv")

## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
risk_labeller <- labeller(avg_eal_tercile = c("High" = "High Risk",
                                                "Moderate" = "Moderate Risk",
                                                "Low" = "Low Risk"))

### 1. Births

medians_births <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_births = median(births, na.rm = TRUE), .groups = "drop")

# Interactive plot: all individual lines (light gray) with median line (blue)
plot_births_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = births, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_births, aes(x = archive_version_year, y = median_births),
            color = "blue", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Employees",
       title = "Employee Size Involved in Business Births by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot_births_interactive_plotly <- ggplotly(plot_births_interactive)

# Static plot: only the median line
plot_births_static <- ggplot(medians_births, aes(x = archive_version_year, y = median_births)) +
  geom_line(color = "blue", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Employee Size",
       title = "Median Employee Size by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### 2. Deaths

medians_deaths <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_deaths = median(deaths, na.rm = TRUE), .groups = "drop")

plot_deaths_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = deaths, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_deaths, aes(x = archive_version_year, y = median_deaths),
            color = "red", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Business Deaths",
       title = "Business Deaths by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
# plot_deaths_interactive_plotly <- ggplotly(plot_deaths_interactive)

plot_deaths_static <- ggplot(medians_deaths, aes(x = archive_version_year, y = median_deaths)) +
  geom_line(color = "red", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Business Deaths",
       title = "Median Business Deaths by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### 3. Entries

medians_entries <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_entries = median(entry, na.rm = TRUE), .groups = "drop")

plot_entries_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = entry, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_entries, aes(x = archive_version_year, y = median_entries),
            color = "green", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Business Entries",
       title = "Business Entries by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
# plot_entries_interactive_plotly <- ggplotly(plot_entries_interactive)

plot_entries_static <- ggplot(medians_entries, aes(x = archive_version_year, y = median_entries)) +
  geom_line(color = "green", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Business Entries",
       title = "Median Business Entries by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

### 4. Exits

medians_exits <- data %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_exits = median(exit, na.rm = TRUE), .groups = "drop")

plot_exits_interactive <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = exit, group = NAME),
            color = "grey", alpha = 0.3) +
  geom_line(data = medians_exits, aes(x = archive_version_year, y = median_exits),
            color = "purple", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Number of Business Exits",
       title = "Business Exits by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
# plot_exits_interactive_plotly <- ggplotly(plot_exits_interactive)

plot_exits_static <- ggplot(medians_exits, aes(x = archive_version_year, y = median_exits)) +
  geom_line(color = "purple", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
  labs(x = "Year", y = "Median Number of Business Exits",
       title = "Median Business Exits by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

# interactive with all + median:
# plot_births_interactive_plotly
# plot_deaths_interactive_plotly
# plot_entries_interactive_plotly
# plot_exits_interactive_plotly

# static with median only
plot_births_static

plot_deaths_static

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

plot_entries_static

plot_exits_static

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

# static with all + median
plot_births_interactive

plot_deaths_interactive

## Warning: Removed 100 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

plot_entries_interactive

plot_exits_interactive

## Warning: Removed 100 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

number of employees involved in ciritcal and non critical business’s business events by risk level over year (not really)

original

all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all_employees.csv")
all$avg_eal_tercile <- factor(all$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
all$pop_tercile <- factor(all$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

births <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_births_employees.csv") %>%
  filter(archive_version_year != 2012)
births$avg_eal_tercile <- factor(births$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
births$pop_tercile <- factor(births$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

entries <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_entries_employees.csv")%>%
  filter(archive_version_year != 2012)
entries$avg_eal_tercile <- factor(entries$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
entries$pop_tercile <- factor(entries$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

deaths <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_deaths_employees.csv")%>%
  filter(archive_version_year != 2022)
deaths$avg_eal_tercile <- factor(deaths$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
deaths$pop_tercile <- factor(deaths$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

exits <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_exits_employees.csv")%>%
  filter(archive_version_year != 2022)
exits$avg_eal_tercile <- factor(exits$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
exits$pop_tercile <- factor(exits$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))


plot1 <- all %>%
  ggplot()+
  geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by population tercile, all - employees")
# ggsave("figures/critical_by_pop_emp.png")

plot2 <- births %>%ggplot()+
  geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, births, births - employees")
# ggsave("figures/naics_by_tercile_births_emp.png")


plot3 <- deaths %>% ggplot()+
  geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, deaths - employees")
# ggsave("figures/naics_by_tercile_deaths_emp.png")

plot4 <- entries %>% ggplot()+
  geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, entries - employees")
# ggsave("figures/naics_by_tercile_entries_emp.png")

plot5 <- exits %>% ggplot()+
  geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
  facet_grid(pop_tercile~critical)+
  ggtitle("Critical by tercile, exits - employees")
# ggsave("figures/naics_by_tercile_exit_emps.png")

plot1

plot2

plot3

plot4

plot5

new

lineplot

all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all_employees.csv") %>%
  filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile))

## Rows: 220 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pop_tercile, avg_eal_tercile, critical
## dbl (2): archive_version_year, employees
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# 2. Factor levels for risk and population terciles
all$avg_eal_tercile <- factor(all$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
all$pop_tercile <- factor(all$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))

# 3. Recode "Not" to "Non-Critical" for the legend
all <- all %>%
  mutate(critical = recode(critical,
                           "Not" = "Non-Critical",
                           "Critical" = "Critical"))

# Create the plot
plot1 <- all %>%
    ggplot(aes(x = archive_version_year, y = employees, color = critical)) +
  # Thinner line
  geom_line(size = 0.6, alpha = 0.8) +
  # Facet by population tercile (rows) and risk level (columns)
  facet_grid(pop_tercile ~ avg_eal_tercile,
             labeller = labeller(avg_eal_tercile = c("High" = "High Risk",
                                                     "Moderate" = "Moderate Risk",
                                                     "Low" = "Low Risk"))) +
  labs(x = "Year",
       y = "Employee Size",
       title = "Employee Size by Population Tercile and Risk Level",
       color = "Critical Status") +
  theme_minimal() +
  theme(legend.position = "top")

plot1

stackedbar plot

all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all_employees.csv") %>%
  filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile))

## Rows: 220 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pop_tercile, avg_eal_tercile, critical
## dbl (2): archive_version_year, employees
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

all <- all %>%
  mutate(avg_eal_tercile = factor(avg_eal_tercile, levels = c("High", "Moderate", "Low"),
                                  labels = c("High Risk", "Moderate Risk", "Low Risk")),
         pop_tercile = factor(pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low")))

# Create the stacked bar plot:
plot1_stk <- all %>%
  ggplot(aes(x = archive_version_year, y = employees, fill = critical)) +
  geom_bar(stat = "identity", position = "stack") +
  facet_grid(pop_tercile ~ avg_eal_tercile) +
  labs(x = "Year",
       y = "Employee Size",
       title = "Employee Size by Population Tercile and Risk Level",
       fill = "Critical Status") +
  theme_minimal() +
  theme(legend.position = "top")

plot1_stk

Mean employee size by risk level and population level

original

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
  mutate(CBSAFP=as.character(CBSAFP)) %>%
  filter(!is.na(avg_eal)) %>%
  left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))

## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))

plot1 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,mean_employees,color=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Mean Employees")
# ggsave("figures/number_of_employees.png")

ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))

new

median mean employee size by population level

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
  mutate(CBSAFP=as.character(CBSAFP)) %>%
  filter(!is.na(avg_eal)) %>%
  left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))

## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

median_meanemployee <- data %>%
  filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile)) %>%
  group_by(archive_version_year, pop_tercile) %>%
  summarise(median_meanemployee = median(mean_employees, na.rm = TRUE), .groups = "drop")

plot1 <- ggplot(median_meanemployee, aes(x = archive_version_year, y = median_meanemployee)) +
  geom_line(color = "blue", size = 1.2) +
  facet_wrap(~pop_tercile, ncol = 3, labeller = labeller(pop_tercile = c("Pop: High" = "High Population", "Pop: Moderate" = "Moderate Population", "Pop: Low" = "Low Population"))) +
  labs(x = "Year", y = "Median Mean Employee Size",
       title = "Median Mean Employee Size by Population Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot1

median mean employee size by risk level

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
  mutate(CBSAFP=as.character(CBSAFP)) %>%
  filter(!is.na(avg_eal)) %>%
  left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))

## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

median_meanemployee <- data %>%
  filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile)) %>%
  group_by(archive_version_year, avg_eal_tercile) %>%
  summarise(median_meanemployee = median(mean_employees, na.rm = TRUE), .groups = "drop")

plot2 <- ggplot(median_meanemployee, aes(x = archive_version_year, y = median_meanemployee)) +
  geom_line(color = "blue", size = 1.2) +
  facet_wrap(~avg_eal_tercile, ncol = 3, labeller = labeller(avg_eal_tercile = c("High" = "High Risk",
                                                     "Moderate" = "Moderate Risk",
                                                     "Low" = "Low Risk"))) +
  labs(x = "Year", y = "Median Mean Employee Size",
       title = "Median Mean Employee Size by Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")
plot2

median mean employee size by risk and population level (all + median)

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>% 
  mutate(CBSAFP = as.character(CBSAFP)) %>% 
  filter(!is.na(avg_eal)) %>% 
  left_join(top100_cbsa, by = c("CBSAFP" = "GEOID")) %>%
  filter(!is.na(pop_tercile))

## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))

medians_employees <- data %>%
  group_by(archive_version_year, pop_tercile, avg_eal_tercile) %>%
  summarise(median_employees = median(mean_employees, na.rm = TRUE), .groups = "drop")

# Create the combined static plot:
# - Individual CBSA lines in thin gray
# - Overlaid median line in blue
# - Facet by population (rows) and risk (columns) with custom labels
plot1_static <- ggplot() +
  geom_line(data = data, aes(x = archive_version_year, y = mean_employees, group = NAME),
            color = "grey", size = 0.5, alpha = 0.5) +
  geom_line(data = medians_employees, aes(x = archive_version_year, y = median_employees),
            color = "blue", size = 1.2) +
  facet_grid(pop_tercile ~ avg_eal_tercile,
             labeller = labeller(
               pop_tercile = c("Pop: Low" = "Low Population",
                               "Pop: Moderate" = "Moderate Population",
                               "Pop: High" = "High Population"),
               avg_eal_tercile = c("High" = "High Risk",
                                   "Moderate" = "Moderate Risk",
                                   "Low" = "Low Risk")
             )) +
  labs(x = "Year", 
       y = "Mean Employees", 
       title = "Mean Employees by Population and Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

plot1_static

Number of Business Events by Population and Risk Level

original

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys_employees.csv")%>%
  mutate(CBSAFP=as.character(CBSAFP)) %>%
  filter(!is.na(avg_eal)) %>%
  left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))

## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))

plot1 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,births,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Births")
# ggsave("figures/births_emp.png")

plot2 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,deaths,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Deaths")
# ggsave("figures/deaths_emp.png")

plot3 <- data %>%
  ggplot()+
  geom_line(aes(archive_version_year,entry,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Entries")
# ggsave("figures/entries_emp.png")

plot4 <- data %>% ggplot()+
  geom_line(aes(archive_version_year,exit,color=NAME,group=NAME),alpha=0.5)+
  facet_grid(pop_tercile~avg_eal_tercile)+
  ggtitle("Exits")
# ggsave("figures/exits_emp.png")

ggplotly(plot1) %>% layout(showlegend = FALSE)

ggplotly(plot2) %>% layout(showlegend = FALSE)

ggplotly(plot3) %>% layout(showlegend = FALSE)

ggplotly(plot4) %>% layout(showlegend = FALSE)

new

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys_employees.csv")%>%
  mutate(CBSAFP=as.character(CBSAFP)) %>%
  filter(!is.na(avg_eal)) %>%
  left_join(top100_cbsa,by=c("CBSAFP"="GEOID")) %>%
  filter(!is.na(pop_tercile))

## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data$avg_eal_tercile <- factor(data$avg_eal_tercile, 
                               levels = c("High", "Moderate", "Low"),
                               labels = c("High Risk", "Moderate Risk", "Low Risk"))
data$pop_tercile <- factor(data$pop_tercile, 
                           levels = c("Pop: Low", "Pop: Moderate", "Pop: High"),
                           labels = c("Low Population", "Moderate Population", "High Population"))


medians_births <- data %>%
  group_by(archive_version_year, pop_tercile, avg_eal_tercile) %>%
  summarise(median_births = median(births, na.rm = TRUE), .groups = "drop")

plot_births <- ggplot() +
  geom_line(data = data, 
            aes(x = archive_version_year, y = births, group = NAME),
            color = "grey", size = 0.5, alpha = 0.5) +
  geom_line(data = medians_births, 
            aes(x = archive_version_year, y = median_births),
            color = "blue", size = 1.2) +
  facet_grid(pop_tercile ~ avg_eal_tercile) +
  labs(x = "Year", 
       y = "Employee Size",
       title = "Employee Size Involved in Business Births by Population and Risk Level") +
  theme_minimal() +
  theme(legend.position = "none")

# Create interactive plotly version with hover text
# plot_births_interactive <- ggplotly(plot_births, tooltip = "text")

# Display the plots (in your R session, you can view them)
# plot_births_interactive
plot_births

Chord Diagrams

nri_cbsa_all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/nri_cbsa_all.csv") %>%
  mutate(CBSAFP =as.character(CBSAFP))

## Rows: 940 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_all
## dbl (3): CBSAFP, avg_eal_all, tercile_all
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

regions <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/top100_regions.csv") %>%
  mutate(GEOID =as.character(GEOID)) %>%
  select(GEOID, Region)

## Rows: 100 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pop_tercile, NAME, Region
## dbl (1): GEOID
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

top100_cbsa <- get_acs(geography = "cbsa", variables = c(pop="B01001_001"),output = "wide") %>%
  top_n(100,popE) %>%
  mutate(pop_tercile_num = ntile(popE, 3)) %>%
  mutate(pop_tercile=as_factor(pop_tercile_num)) %>%
  mutate(pop_tercile = factor(pop_tercile, levels = 1:3, labels = c("Pop: Low", "Pop: Moderate", "Pop: High"))) %>% 
  select(pop_tercile,pop_tercile_num,popE,GEOID) %>%
  left_join(regions)

## Getting data from the 2019-2023 5-year ACS

## Warning: • You have not set a Census API key. Users without a key are limited to 500
## queries per day and may experience performance limitations.
## ℹ For best results, get a Census API key at
## http://api.census.gov/data/key_signup.html and then supply the key to the
## `census_api_key()` function to use it throughout your tidycensus session.
## This warning is displayed once per session.

## Joining with `by = join_by(GEOID)`

top100_cbsa_new <- top100_cbsa %>%
  rename_with(~paste0(., "_new"), everything())
top100_cbsa_old <- top100_cbsa %>%
  rename_with(~paste0(., "_old"), everything())

moves <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/migration_map.csv") %>%
  select(CBSAFP_old,CBSAFP_new,NAME_old,NAME_new,row_count,avg_eal_tercile_old,avg_eal_tercile_new) %>%
  mutate(CBSAFP_old=as.character(CBSAFP_old)) %>%
  mutate(CBSAFP_new=as.character(CBSAFP_new)) %>%
  mutate(NAME_old=ifelse(is.na(avg_eal_tercile_old),"Outside",NAME_old)) %>%
  mutate(NAME_new=ifelse(is.na(avg_eal_tercile_new),"Outside",NAME_new)) %>%
  group_by(CBSAFP_old,CBSAFP_new,NAME_old,NAME_new,avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count)) %>%
  left_join(top100_cbsa_old, by=c("CBSAFP_old"="GEOID_old")) %>%
  left_join(top100_cbsa_new, by=c("CBSAFP_new"="GEOID_new")) %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new)) %>%
  mutate(Region_old=ifelse(is.na(Region_old),"Outside",Region_old)) %>%
  mutate(Region_new=ifelse(is.na(Region_new),"Outside",Region_new))

## Rows: 63970 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): fips_code, lead_county, avg_eal_tercile_old, NAME_old, avg_eal_ter...
## dbl (12): archive_version_year, row_count, CBSAFP_old, CBSAFP_new, avg_eal_o...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'CBSAFP_old', 'CBSAFP_new', 'NAME_old', 'NAME_new', 'avg_eal_tercile_old'. You can override using the `.groups` argument.

Moving between risk level

Chord Plot

data <- moves %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))

## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption="Risk level to Risk level")

Risk level to Risk level
avg_eal_tercile_old	High	Low	Moderate	Outside
High	0.1418442	0.0145098	0.0363402	0.0729597
Low	0.0149617	0.0161366	0.0323389	0.0577516
Moderate	0.0303752	0.0298987	0.0341958	0.0614735
Outside	0.0696075	0.0633796	0.0675453	0.2566818

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Risk level to Risk level", cex.main = 1)

# png(file="figures/risk_to_risk.png")
dev.off()

## null device 
##           1

data <- moves %>%
  filter(Region_old==Region_new) %>%
  filter(Region_old!="Outside") %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))

## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Moves within Region")

Moves within Region
avg_eal_tercile_old	High	Low	Moderate
High	0.5021631	0.0183028	0.0656683
Low	0.0214088	0.0354964	0.0905158
Moderate	0.0593086	0.0872989	0.1198373

chordDiagram(
  x=data, directional = 1, direction.type = c("diffHeight", "arrows"),link.arr.type = "big.arrow",
    grid.col = c(High="red",Moderate="yellow", Low="green"))
title("Moves within Region", cex.main = 1)
# png(file="figures/moves_within_region.png")
dev.off()

## null device 
##           1

data <- moves %>%
  filter(Region_old!=Region_new) %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))

## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.

table <- moves %>%
  filter(Region_old!=Region_new) %>%
  filter(avg_eal_tercile_new=="Outside") %>%
  left_join(nri_cbsa_all,by=c("CBSAFP_new"="CBSAFP")) %>%
  group_by(avg_eal_tercile_all) %>%
  summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
  kable(caption="Risk of Destinations outside top100")
table

Risk of Destinations outside top100
avg_eal_tercile_all	row_count
High	13209
Low	3834
Moderate	5771

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Moves outside Region")

Moves outside Region
avg_eal_tercile_old	High	Low	Moderate	Outside
High	0.0591552	0.0204144	0.0424992	0.1398169
Low	0.0199004	0.0161259	0.0238998	0.1092194
Moderate	0.0336171	0.0204947	0.0147928	0.1173948
Outside	0.1335528	0.1191776	0.1299390	NA

chordDiagram(
  x=data, directional = 1, direction.type = c("diffHeight", "arrows"),link.arr.type = "big.arrow",
    grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Moves outside Region", cex.main = 1)
# png(file="figures/moves_outside_region.png")
dev.off()

## null device 
##           1

Table

moves_table <- moves %>%
  group_by(avg_eal_tercile_old, avg_eal_tercile_new) %>%
  summarise(row_count = sum(row_count, na.rm = TRUE), .groups = "drop") %>%
  # Replace NA with "Outside" for both origin and destination
  mutate(
    avg_eal_tercile_old = ifelse(is.na(avg_eal_tercile_old), "Outside", avg_eal_tercile_old),
    avg_eal_tercile_new = ifelse(is.na(avg_eal_tercile_new), "Outside", avg_eal_tercile_new)
  )

total_moves <- sum(moves_table$row_count, na.rm = TRUE)

moves_table <- moves_table %>%
  mutate(pct = round(row_count / total_moves * 100, 1))

moves_wide <- moves_table %>%
  select(avg_eal_tercile_old, avg_eal_tercile_new, pct) %>%
  pivot_wider(names_from = avg_eal_tercile_new, values_from = pct, values_fill = 0)

moves_wide <- moves_wide %>%
  rename(`Origin Risk Level` = avg_eal_tercile_old)

desired_order <- c("High", "Moderate", "Low", "Outside")

moves_wide$`Origin Risk Level` <- factor(moves_wide$`Origin Risk Level`,
                                         levels = desired_order)
moves_wide <- moves_wide %>%
  select(`Origin Risk Level`, all_of(desired_order)) %>%
  arrange(`Origin Risk Level`)

moves_wide %>%
  kable(
    format = "html",
    digits = 1,
    caption = "Percentage of Business Moves by Risk Level (Origin → Destination) [%]"
  ) %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) %>%
  add_header_above(c(" " = 1, "Destination Risk Level" = 4))

Percentage of Business Moves by Risk Level (Origin → Destination) [%]
	Destination Risk Level
Origin Risk Level	High	Moderate	Low	Outside
High	14.2	3.6	1.5	7.3
Moderate	3.0	3.4	3.0	6.1
Low	1.5	3.2	1.6	5.8
Outside	7.0	6.8	6.3	25.7

This chunk produces the chord diagrams for critical services and non-critical services

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_crit.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))

## Rows: 16 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Critical: Risk level to Risk level")

Critical: Risk level to Risk level
avg_eal_tercile_old	Low	Moderate	High	Outside
Low	0.0160731	0.0317129	0.0136222	0.0600542
Moderate	0.0289641	0.0325660	0.0281381	0.0618009
High	0.0120379	0.0328097	0.1314421	0.0702234
Outside	0.0668382	0.0712932	0.0700339	0.2723900

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Critical: Risk level to Risk level", cex.main = 1)

# png(file="figures/critical_risk_to_risk.png")
dev.off()

## null device 
##           1

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_non_crit.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))

## Rows: 16 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Non-Critical: Risk level to Risk level")

Non-Critical: Risk level to Risk level
avg_eal_tercile_old	Low	Moderate	High	Outside
Low	0.0162345	0.0333048	0.0170285	0.0541986
Moderate	0.0313408	0.0367105	0.0338271	0.0609682
High	0.0183239	0.0417877	0.1578947	0.0771818
Outside	0.0580431	0.0617622	0.0689497	0.2324440

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Non-Critical: Risk level to Risk level", cex.main = 1)

# png(file="figures/non_critical_risk_to_risk.png")
dev.off()

## null device 
##           1

This chunk produces the chord diagrams for standalone and chains

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_chain.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))

## Rows: 16 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Chain: Risk level to Risk level")

Chain: Risk level to Risk level
avg_eal_tercile_old	Low	Moderate	High	Outside
Low	0.0239119	0.0384862	0.0292970	0.0452547
Moderate	0.0373499	0.0460452	0.0461934	0.0478237
High	0.0328541	0.0570130	0.1558223	0.0509856
Outside	0.0636826	0.0644731	0.0606689	0.2001383

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Chain: Risk level to Risk level", cex.main = 1)

# png(file="figures/chain_risk_to_risk.png")
dev.off()

## null device 
##           1

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_standalone.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))

## Rows: 16 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Standalone: Risk level to Risk level")

Standalone: Risk level to Risk level
avg_eal_tercile_old	Low	Moderate	High	Outside
Low	0.0145856	0.0311126	0.0121021	0.0602444
Moderate	0.0284123	0.0318321	0.0272199	0.0641963
High	0.0108505	0.0322164	0.1390559	0.0773431
Outside	0.0633192	0.0681581	0.0713906	0.2679610

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Standalone: Risk level to Risk level", cex.main = 1)
# png(file="figures/standalone_risk_to_risk.png")
dev.off()

## null device 
##           1

This chunk produces across population categories. Outside the top100 is excluded.

#within same population category
data <- moves %>%
  filter(pop_tercile_num_old==pop_tercile_num_new) %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))

## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Same pop category: Risk level to Risk level")

Same pop category: Risk level to Risk level
avg_eal_tercile_old	High	Low	Moderate	Outside
High	0.4685364	0.0425758	0.1014529	NA
Low	0.0436896	0.0349314	0.0807472	0.0006075
Moderate	0.0831266	0.0680909	0.0714322	0.0016706
Outside	NA	0.0004556	0.0026831	NA

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Same pop category: Risk level to Risk level", cex.main = 1)

# png(file="figures/samepop_risk_to_risk.png")

#lower pop to higher pop
data <- moves %>%
  filter(!is.na(pop_tercile_old),pop_tercile_num_old>pop_tercile_num_new) %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))

## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Towards higher pop: Risk level to Risk level")

Towards higher pop: Risk level to Risk level
avg_eal_tercile_old	High	Low	Moderate	Outside
High	0.3667207	0.0683614	0.1417865	0.0037472
Low	0.0196476	0.0562082	0.0668422	0.0085072
Moderate	0.0282560	0.0941868	0.1438120	0.0019242

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Towards higher pop: Risk level to Risk level", cex.main = 1)

# png(file="figures/low_to_high_risk_to_risk.png")

#higher pop to lower pop
data <- moves %>%
  filter(!is.na(pop_tercile_old),pop_tercile_num_old<pop_tercile_num_new) %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))

## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Toward lower pop: Risk level to Risk level")

Toward lower pop: Risk level to Risk level
avg_eal_tercile_old	High	Low	Moderate
High	0.3675349	0.0159453	0.0325839
Low	0.0676439	0.0499158	0.1152818
Moderate	0.1219174	0.0842825	0.1305338
Outside	0.0036645	0.0092107	0.0014856

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Toward lower pop: Risk level to Risk level", cex.main = 1)

# png(file="figures/high_to_low_risk_to_risk.png")

This chunk produces across population categories. Inclusive of top100.

#within same population category
data <- moves %>%
  filter(pop_tercile_num_old==pop_tercile_num_new|avg_eal_tercile_old=="Outside"|avg_eal_tercile_new=="Outside") %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))

## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Same pop category: Risk level to Risk level")

Same pop category: Risk level to Risk level
avg_eal_tercile_old	High	Low	Moderate	Outside
High	0.0937832	0.0085221	0.0203070	0.0899833
Low	0.0087450	0.0069919	0.0161625	0.0712266
Moderate	0.0166388	0.0136292	0.0142980	0.0758170
Outside	0.0858489	0.0781679	0.0833055	0.3165729

table <- moves %>%
  filter(avg_eal_tercile_old=="Outside") %>%
  filter(avg_eal_tercile_new=="Outside") %>%
  left_join(nri_cbsa_all,by=c("CBSAFP_new"="CBSAFP")) %>%
  group_by(avg_eal_tercile_all) %>%
  summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
  kable(caption="Same pop category:Risk of Destinations outside top100")
table

Same pop category:Risk of Destinations outside top100
avg_eal_tercile_all	row_count
High	13625
Low	8181
Moderate	9435

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Same pop category: Risk level to Risk level", cex.main = 1)

# png(file="figures/samepop_risk_to_risk.png")

#lower pop to higher pop
data <- moves %>%
  filter(!is.na(pop_tercile_old),pop_tercile_num_old>pop_tercile_num_new) %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))

## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Towards higher pop: Risk level to Risk level")

Towards higher pop: Risk level to Risk level
avg_eal_tercile_old	High	Low	Moderate	Outside
High	0.3667207	0.0683614	0.1417865	0.0037472
Low	0.0196476	0.0562082	0.0668422	0.0085072
Moderate	0.0282560	0.0941868	0.1438120	0.0019242

table <- moves %>%
  filter(!is.na(pop_tercile_old),pop_tercile_num_old>pop_tercile_num_new) %>%
  filter(avg_eal_tercile_new=="Outside") %>%
  left_join(nri_cbsa_all,by=c("CBSAFP_new"="CBSAFP")) %>%
  group_by(avg_eal_tercile_all) %>%
  summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
  kable(caption="Towards higher pop:Risk of Destinations outside top100")
table

Towards higher pop:Risk of Destinations outside top100
avg_eal_tercile_all	row_count
Moderate	140

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Towards higher pop: Risk level to Risk level", cex.main = 1)

# png(file="figures/low_to_high_risk_to_risk.png")

#higher pop to lower pop
data <- moves %>%
  filter(!is.na(pop_tercile_old),pop_tercile_num_old<pop_tercile_num_new) %>%
  group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
  summarise(row_count=sum(row_count))

## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Toward lower pop: Risk level to Risk level")

Toward lower pop: Risk level to Risk level
avg_eal_tercile_old	High	Low	Moderate
High	0.3675349	0.0159453	0.0325839
Low	0.0676439	0.0499158	0.1152818
Moderate	0.1219174	0.0842825	0.1305338
Outside	0.0036645	0.0092107	0.0014856

table <- moves %>%
  filter(!is.na(pop_tercile_old),pop_tercile_num_old<pop_tercile_num_new) %>%
  filter(avg_eal_tercile_new=="Outside") %>%
  left_join(nri_cbsa_all,by=c("CBSAFP_new"="CBSAFP")) %>%
  group_by(avg_eal_tercile_all) %>%
  summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
  kable(caption="Towards lower pop:Risk of Destinations outside top100")
table

Towards lower pop:Risk of Destinations outside top100
avg_eal_tercile_all	row_count

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Toward lower pop: Risk level to Risk level", cex.main = 1)

# png(file="figures/high_to_low_risk_to_risk.png")

This chunk produces for same population moves, crit/non, chain/non-chain

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_crit_same_pop.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))

## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Critical Same Population Moves: Risk level to Risk level")

Critical Same Population Moves: Risk level to Risk level
avg_eal_tercile_old	High	Low	Moderate
High	0.4626455	0.0411587	0.0918508
Low	0.0439087	0.0399670	0.0880924
Moderate	0.0735173	0.0747090	0.0841507

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Critical Same Population Moves: Risk level to Risk level", cex.main = 1)

# png(file="figures/chain_same_pop_risk_to_risk.png")
dev.off()

## null device 
##           1

#noncrit
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_non_crit_same_pop.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))

## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Non-Critical Same Population Moves: Risk level to Risk level")

Non-Critical Same Population Moves: Risk level to Risk level
avg_eal_tercile_old	High	Low	Moderate
High	0.4721569	0.0495238	0.1001681
Low	0.0476190	0.0302521	0.0770868
Moderate	0.0834734	0.0676751	0.0720448

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Non-Critical Same Population Moves: Risk level to Risk level", cex.main = 1)
# png(file="figures/chain_same_pop_risk_to_risk.png")
dev.off()

## null device 
##           1

#chain

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_chain_same_pop.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))

## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Chain Same Population Moves: Risk level to Risk level")

Chain Same Population Moves: Risk level to Risk level
avg_eal_tercile_old	High	Low	Moderate
High	0.3718961	0.0741097	0.1376323
Low	0.0666025	0.0381136	0.0694899
Moderate	0.1054860	0.0625602	0.0741097

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Chain Same Population Moves: Risk level to Risk level", cex.main = 1)
# png(file="figures/chain_same_pop_risk_to_risk.png")
dev.off()

## null device 
##           1

#standalone
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_standalone_same_pop.csv") %>%
  mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
  mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))

## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

data %>%
  transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
  pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
  kable(caption = "Standalone Same Population Moves: Risk level to Risk level")

Standalone Same Population Moves: Risk level to Risk level
avg_eal_tercile_old	High	Low	Moderate
High	0.5006490	0.0345652	0.0806749
Low	0.0381174	0.0347018	0.0879842
Moderate	0.0682424	0.0747319	0.0803334

chordDiagram(
  x=data,
  directional = 1,
  direction.type = c("diffHeight", "arrows"),
  link.arr.type = "big.arrow",
  grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Standalone Same Population Moves: Risk level to Risk level", cex.main = 1)
# png(file="figures/chain_same_pop_risk_to_risk.png")
dev.off()

## null device 
##           1

Bar Charts

This chunk produces bar graphs for all 100 CBSAs and corresponding entries/exits and births/deaths for all years of data

library(ggpattern)

## Warning: package 'ggpattern' was built under R version 4.4.1

data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv")

## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

summary(data)

##  archive_version_year     CBSAFP          entry             exit       
##  Min.   :2013         Min.   :10420   Min.   :  1.00   Min.   :  1.00  
##  1st Qu.:2015         1st Qu.:19348   1st Qu.: 26.00   1st Qu.: 25.00  
##  Median :2018         Median :32060   Median : 45.00   Median : 44.00  
##  Mean   :2018         Mean   :30209   Mean   : 67.08   Mean   : 67.08  
##  3rd Qu.:2020         3rd Qu.:40080   3rd Qu.: 84.00   3rd Qu.: 79.00  
##  Max.   :2022         Max.   :49340   Max.   :561.00   Max.   :670.00  
##                                                        NA's   :100     
##      births           deaths           NAME              avg_eal     
##  Min.   :  1328   Min.   :  1128   Length:1000        Min.   :38.21  
##  1st Qu.:  3617   1st Qu.:  3416   Class :character   1st Qu.:65.60  
##  Median :  6166   Median :  5294   Mode  :character   Median :75.28  
##  Mean   : 12411   Mean   : 10836                      Mean   :77.28  
##  3rd Qu.: 13648   3rd Qu.: 11576                      3rd Qu.:94.11  
##  Max.   :171670   Max.   :139494                      Max.   :99.91  
##                   NA's   :100                                        
##     tercile     avg_eal_tercile   
##  Min.   :1.00   Length:1000       
##  1st Qu.:1.00   Class :character  
##  Median :2.00   Mode  :character  
##  Mean   :1.99                     
##  3rd Qu.:3.00                     
##  Max.   :3.00                     
##

data <- data %>% gather(type,row_count, c(entry,exit,births,deaths)) %>%
  group_by(NAME,avg_eal_tercile,tercile,type) %>%
  summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
  mutate(label= paste0(type,"-",avg_eal_tercile))

## `summarise()` has grouped output by 'NAME', 'avg_eal_tercile', 'tercile'. You
## can override using the `.groups` argument.

color_ramp <- c("births-Low" = "#00b100", "births-Moderate" = "#007e00", "births-High" = "#004b00",
          "deaths-Low" = "#c600c6", "deaths-Moderate" = "#9f009f", "deaths-High" = "#780078")
data %>%
  filter(type %in% c("births", "deaths")) %>%
  group_by(NAME) %>%
  mutate(sort = sum(row_count,na.rm = T)) %>%
  ungroup() %>%
  ggplot(aes(x = fct_reorder(NAME, sort), y = row_count, fill = label)) + 
  geom_bar(position = "dodge", stat = "identity")+
  coord_flip()+
  theme_minimal()+
  scale_fill_manual(values=color_ramp)+
  ggtitle("Births and Deaths - 2013-2022")+
  theme(legend.position = "bottom")

# ggsave("figures/bar_chart_births_deaths.png")

color_ramp <- c("entry-Low" = "#00b100", "entry-Moderate" = "#007e00", "entry-High" = "#004b00",
          "exit-Low" = "#c600c6", "exit-Moderate" = "#9f009f", "exit-High" = "#780078")
data %>% filter(type %in% c("entry","exit")) %>%
  group_by(NAME) %>%
  mutate(sort = sum(row_count,na.rm = T)) %>%
  ungroup() %>%
  ggplot(aes(fill = label, y = row_count, x = forcats::fct_reorder(NAME, sort))) + 
  geom_bar(position = "dodge", stat = "identity")+
  coord_flip()+
  theme_minimal()+
  scale_fill_manual(values=color_ramp)+
  ggtitle("Entries and Exits - 2013-2022")+
  theme(legend.position = "bottom")

# ggsave("figures/bar_chart_entries_exits.png")

library(ggrepel)  
# Load and preprocess data (using your original code)
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv") %>%
  gather(type, row_count, c(entry, exit, births, deaths)) %>%
  group_by(NAME, avg_eal_tercile, tercile, type) %>%
  summarise(row_count = sum(row_count, na.rm = TRUE)) %>%
  mutate(label = paste0(type, "-", avg_eal_tercile))

## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'NAME', 'avg_eal_tercile', 'tercile'. You can override using the `.groups` argument.

# Pivot to get births/deaths terciles per CBSA
plot_data <- data %>%
  filter(type %in% c("births", "deaths")) %>%
  select(NAME, type, tercile) %>%
  pivot_wider(names_from = type, values_from = tercile) %>%
  mutate(
    # Convert terciles to numeric positions (Moderate = 0)
    x = case_when(
      births == 1 ~ -1,
      births == 2 ~ 0,
      births == 3 ~ 1
    ),
    y = case_when(
      deaths == 1 ~ -1,
      deaths == 2 ~ 0,
      deaths == 3 ~ 1
    )
  )

## Adding missing grouping variables: `avg_eal_tercile`

# Plot
ggplot(plot_data, aes(x = x, y = y)) +
  # Reference lines (center at Moderate)
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
  geom_hline(yintercept = 0, linetype = "dashed", color = "gray70") +
  # Points (smaller size = 2)
  geom_point(aes(color = avg_eal_tercile), size = 2, alpha = 0.8) +
  # Non-overlapping labels
  geom_text_repel(
    aes(label = NAME),
    size = 2.5,
    max.overlaps = 20,
    segment.size = 0.2
  ) +
  # Axis labels (discrete scale)
  scale_x_continuous(
    breaks = c(-1, 0, 1),
    labels = c("Low Births", "Moderate Births", "High Births")
  ) +
  scale_y_continuous(
    breaks = c(-1, 0, 1),
    labels = c("Low Deaths", "Moderate Deaths", "High Deaths")
  ) +
  # Color scheme (EAL terciles)
  scale_color_manual(
    values = c("Low" = "#00b100", "Moderate" = "#007e00", "High" = "#004b00")
  ) +
  labs(
    title = "Births vs. Deaths by CBSA (Tercile Comparison)",
    x = "",
    y = "",
    color = "EAL Tercile"
  ) +
  theme_minimal() +
  theme(
    legend.position = "bottom",
    panel.grid.major = element_blank()  # Remove gridlines for clarity
  )

## Warning: ggrepel: 100 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

Expansion Test

expansion_size <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_rawdata.csv") %>%
  mutate(growth_count=employee_size_location-lag_employees,
         growth_pct=growth_count/lag_employees)

## Rows: 100600 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): company, avg_eal_tercile, avg_eal_tercile_parent, parent_region
## dbl (13): archive_version_year, parent_number, fips_code, primary_naics_code...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

summary(expansion_size$growth_count)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     1.00     1.00     4.00    23.62    11.00 64975.00

expansion_size %>%
  filter(growth_count <= quantile(growth_count, 0.95)) %>%
  ggplot()+geom_histogram(aes(growth_count))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

summary(expansion_size$growth_pct)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     0.001     0.333     0.667     3.626     2.000 10999.000

expansion_size %>%
  filter(growth_pct <= quantile(growth_pct, 0.95)) %>%
  ggplot()+geom_histogram(aes(growth_pct))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

process_data <- function(file_path, caption, include_yearly_column = FALSE) {
  table <- read_csv(file_path) %>%
    mutate(
      avg_eal_tercile_parent = ifelse(is.na(avg_eal_tercile_parent), "Outside100", avg_eal_tercile_parent),
      Total = Low + Moderate + High,
      Low_pct = round(Low / Total, 2),
      Moderate_pct = round(Moderate / Total, 2),
      High_pct = round(High / Total, 2),
    )
  
  if (include_yearly_column) {
    table <- table %>%
      rename(year=archive_version_year) %>%
      select(year,avg_eal_tercile_parent, Low, Low_pct, Moderate, Moderate_pct, High, High_pct, Total)
  } else {
      table <- table %>%
      select(avg_eal_tercile_parent, Low, Low_pct, Moderate, Moderate_pct, High, High_pct, Total)
  }
  
  kable(table, caption = caption)
}



# All expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_all.csv", "ALL Expansions, rows (hq), columns (branch)")

## Rows: 4 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (4): Low, Moderate, High, <NA>
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

ALL Expansions, rows (hq), columns (branch)
avg_eal_tercile_parent	Low	Low_pct	Moderate	Moderate_pct	High	High_pct	Total
High	3343	0.22	4861	0.32	6970	0.46	15174
Low	5331	0.40	4463	0.34	3412	0.26	13206
Moderate	2857	0.24	5810	0.50	3049	0.26	11716
Outside100	4421	0.31	5441	0.38	4497	0.31	14359

# Yearly expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_yearly.csv", "A<< Expansions yearly, rows (hq), columns (branch)", TRUE)

## Rows: 40 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (5): archive_version_year, Low, Moderate, High, <NA>
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

A<< Expansions yearly, rows (hq), columns (branch)
year	avg_eal_tercile_parent	Low	Low_pct	Moderate	Moderate_pct	High	High_pct	Total
2013	High	678	0.22	1057	0.34	1356	0.44	3091
2013	Low	673	0.36	630	0.34	558	0.30	1861
2013	Moderate	570	0.26	986	0.45	657	0.30	2213
2013	Outside100	952	0.31	1114	0.36	1028	0.33	3094
2014	High	202	0.20	314	0.31	498	0.49	1014
2014	Low	237	0.43	177	0.32	138	0.25	552
2014	Moderate	205	0.23	453	0.51	237	0.26	895
2014	Outside100	272	0.31	325	0.37	273	0.31	870
2015	High	201	0.25	244	0.31	351	0.44	796
2015	Low	355	0.41	297	0.34	214	0.25	866
2015	Moderate	173	0.23	398	0.54	169	0.23	740
2015	Outside100	212	0.28	301	0.40	245	0.32	758
2016	High	1014	0.21	1502	0.31	2282	0.48	4798
2016	Low	1289	0.43	929	0.31	775	0.26	2993
2016	Moderate	849	0.24	1763	0.50	929	0.26	3541
2016	Outside100	1221	0.29	1622	0.38	1408	0.33	4251
2017	High	230	0.23	321	0.32	449	0.45	1000
2017	Low	313	0.46	217	0.32	154	0.23	684
2017	Moderate	172	0.23	390	0.53	176	0.24	738
2017	Outside100	309	0.32	359	0.38	288	0.30	956
2018	High	163	0.21	249	0.32	366	0.47	778
2018	Low	250	0.44	183	0.32	135	0.24	568
2018	Moderate	180	0.25	362	0.50	180	0.25	722
2018	Outside100	408	0.33	490	0.40	341	0.28	1239
2019	High	119	0.25	148	0.31	210	0.44	477
2019	Low	131	0.43	101	0.33	71	0.23	303
2019	Moderate	97	0.27	172	0.47	96	0.26	365
2019	Outside100	124	0.32	156	0.40	112	0.29	392
2020	High	107	0.21	173	0.33	238	0.46	518
2020	Low	155	0.40	126	0.33	104	0.27	385
2020	Moderate	79	0.25	158	0.50	80	0.25	317
2020	Outside100	120	0.32	153	0.40	106	0.28	379
2021	High	121	0.25	152	0.31	217	0.44	490
2021	Low	210	0.40	149	0.28	170	0.32	529
2021	Moderate	83	0.28	158	0.54	54	0.18	295
2021	Outside100	114	0.28	148	0.36	145	0.36	407
2022	High	508	0.23	701	0.32	1003	0.45	2212
2022	Low	1718	0.38	1654	0.37	1093	0.24	4465
2022	Moderate	449	0.24	970	0.51	471	0.25	1890
2022	Outside100	689	0.34	773	0.38	551	0.27	2013

# All expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_all_10.csv", "Expansions, rows (hq), columns (branch)")

## Rows: 4 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (4): Low, Moderate, High, <NA>
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Expansions, rows (hq), columns (branch)
avg_eal_tercile_parent	Low	Low_pct	Moderate	Moderate_pct	High	High_pct	Total
High	979	0.21	1424	0.31	2227	0.48	4630
Low	1383	0.47	868	0.29	713	0.24	2964
Moderate	743	0.23	1718	0.53	783	0.24	3244
Outside100	1252	0.31	1530	0.38	1253	0.31	4035

# Yearly expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_yearly_10.csv", "Expansions yearly, rows (hq), columns (branch)", TRUE)

## Rows: 40 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (5): archive_version_year, Low, Moderate, High, <NA>
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Expansions yearly, rows (hq), columns (branch)
year	avg_eal_tercile_parent	Low	Low_pct	Moderate	Moderate_pct	High	High_pct	Total
2013	High	225	0.22	346	0.33	466	0.45	1037
2013	Low	211	0.39	159	0.29	169	0.31	539
2013	Moderate	150	0.24	287	0.46	189	0.30	626
2013	Outside100	323	0.30	374	0.35	384	0.36	1081
2014	High	49	0.19	77	0.30	134	0.52	260
2014	Low	71	0.44	55	0.34	37	0.23	163
2014	Moderate	41	0.19	128	0.60	43	0.20	212
2014	Outside100	73	0.28	93	0.36	92	0.36	258
2015	High	47	0.20	68	0.30	115	0.50	230
2015	Low	70	0.42	50	0.30	48	0.29	168
2015	Moderate	49	0.25	96	0.49	51	0.26	196
2015	Outside100	58	0.26	88	0.39	77	0.35	223
2016	High	269	0.21	401	0.31	612	0.48	1282
2016	Low	303	0.47	190	0.30	151	0.23	644
2016	Moderate	214	0.24	461	0.52	219	0.24	894
2016	Outside100	255	0.30	334	0.40	251	0.30	840
2017	High	84	0.21	130	0.32	191	0.47	405
2017	Low	119	0.49	72	0.30	53	0.22	244
2017	Moderate	47	0.22	136	0.63	32	0.15	215
2017	Outside100	129	0.36	128	0.36	103	0.29	360
2018	High	60	0.19	93	0.30	161	0.51	314
2018	Low	104	0.50	60	0.29	45	0.22	209
2018	Moderate	56	0.21	140	0.53	67	0.25	263
2018	Outside100	110	0.34	133	0.41	82	0.25	325
2019	High	34	0.24	39	0.27	69	0.49	142
2019	Low	59	0.46	41	0.32	28	0.22	128
2019	Moderate	30	0.27	56	0.51	24	0.22	110
2019	Outside100	49	0.30	70	0.43	45	0.27	164
2020	High	24	0.16	40	0.27	83	0.56	147
2020	Low	49	0.49	35	0.35	16	0.16	100
2020	Moderate	21	0.23	55	0.60	16	0.17	92
2020	Outside100	36	0.29	60	0.48	28	0.23	124
2021	High	43	0.30	33	0.23	65	0.46	141
2021	Low	45	0.46	26	0.27	27	0.28	98
2021	Moderate	23	0.25	56	0.60	14	0.15	93
2021	Outside100	32	0.29	36	0.33	41	0.38	109
2022	High	144	0.21	197	0.29	331	0.49	672
2022	Low	352	0.52	180	0.27	139	0.21	671
2022	Moderate	112	0.21	303	0.56	128	0.24	543
2022	Outside100	187	0.34	214	0.39	150	0.27	551

process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_all_20.csv", "Expansions, rows (hq), columns (branch)")

## Rows: 4 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (4): Low, Moderate, High, <NA>
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Expansions, rows (hq), columns (branch)
avg_eal_tercile_parent	Low	Low_pct	Moderate	Moderate_pct	High	High_pct	Total
High	529	0.20	819	0.31	1337	0.50	2685
Low	886	0.49	522	0.29	409	0.23	1817
Moderate	474	0.23	1127	0.54	480	0.23	2081
Outside100	709	0.29	1006	0.41	723	0.30	2438

# Yearly expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_yearly_20.csv", "Expansions yearly, rows (hq), columns (branch)", TRUE)

## Rows: 40 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (5): archive_version_year, Low, Moderate, High, <NA>
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Expansions yearly, rows (hq), columns (branch)
year	avg_eal_tercile_parent	Low	Low_pct	Moderate	Moderate_pct	High	High_pct	Total
2013	High	121	0.20	200	0.33	281	0.47	602
2013	Low	115	0.41	78	0.28	90	0.32	283
2013	Moderate	74	0.22	159	0.48	99	0.30	332
2013	Outside100	152	0.26	242	0.41	194	0.33	588
2014	High	25	0.15	50	0.30	92	0.55	167
2014	Low	53	0.49	37	0.34	19	0.17	109
2014	Moderate	25	0.18	87	0.63	26	0.19	138
2014	Outside100	51	0.28	74	0.41	57	0.31	182
2015	High	20	0.14	44	0.31	76	0.54	140
2015	Low	47	0.44	36	0.34	24	0.22	107
2015	Moderate	26	0.22	60	0.51	32	0.27	118
2015	Outside100	33	0.23	62	0.44	47	0.33	142
2016	High	140	0.20	220	0.32	327	0.48	687
2016	Low	185	0.45	122	0.29	108	0.26	415
2016	Moderate	160	0.27	302	0.51	130	0.22	592
2016	Outside100	148	0.27	231	0.43	163	0.30	542
2017	High	53	0.20	83	0.32	127	0.48	263
2017	Low	86	0.52	51	0.31	27	0.16	164
2017	Moderate	28	0.20	93	0.65	22	0.15	143
2017	Outside100	71	0.34	77	0.37	59	0.29	207
2018	High	42	0.20	60	0.29	104	0.50	206
2018	Low	82	0.49	51	0.31	33	0.20	166
2018	Moderate	42	0.21	106	0.54	49	0.25	197
2018	Outside100	63	0.34	73	0.39	49	0.26	185
2019	High	23	0.27	24	0.28	39	0.45	86
2019	Low	45	0.49	29	0.32	18	0.20	92
2019	Moderate	19	0.28	34	0.50	15	0.22	68
2019	Outside100	37	0.33	47	0.42	27	0.24	111
2020	High	14	0.15	20	0.22	58	0.63	92
2020	Low	31	0.49	25	0.40	7	0.11	63
2020	Moderate	13	0.23	32	0.56	12	0.21	57
2020	Outside100	25	0.31	37	0.46	19	0.23	81
2021	High	17	0.24	14	0.20	40	0.56	71
2021	Low	26	0.57	7	0.15	13	0.28	46
2021	Moderate	17	0.27	36	0.56	11	0.17	64
2021	Outside100	18	0.33	17	0.31	20	0.36	55
2022	High	74	0.20	104	0.28	193	0.52	371
2022	Low	216	0.58	86	0.23	70	0.19	372
2022	Moderate	70	0.19	218	0.59	84	0.23	372
2022	Outside100	111	0.32	146	0.42	88	0.26	345

Figures

Descriptive States

Census

T-tests 3 is highest risk, 2 is moderate, 1 is low

Line Graphs

Number of Business & Mean Establishment Year by risk level over year

Original

New

Business Events by risk level over year

Origianl

New

number of critical and non critical business’s business events by risk level over year

original

new (just overall for now, i will add each business events)

lineplot version

stacked barplot version

Number of Business & Mean Year Established by risk level and population level over year

original

new

Number of Business Involved in Each Business Events

Old

new

Line Employees

(Need to understand what this is doing)

original

new

number of employees involved in ciritcal and non critical business’s business events by risk level over year (not really)

original

new

lineplot

stackedbar plot

Mean employee size by risk level and population level

original

new

median mean employee size by population level

median mean employee size by risk level

median mean employee size by risk and population level (all + median)

Number of Business Events by Population and Risk Level

original

new

Chord Diagrams

Moving between risk level

Chord Plot

Table

This chunk produces the chord diagrams for critical services and non-critical services

Bar Charts

Expansion Test