data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv")
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
table1 <- data %>%
mutate(outside_top_100=ifelse(is.na(avg_eal_tercile),"outside_top_100","within_top_100")) %>%
group_by(archive_version_year,outside_top_100) %>%
summarise(rows=sum(rows,na.rm = TRUE)) %>%
pivot_wider(id_cols = archive_version_year,names_from = outside_top_100,values_from = rows) %>%
mutate(outside_prop=outside_top_100/(outside_top_100+within_top_100),
inside_prop=within_top_100/(outside_top_100+within_top_100))
## `summarise()` has grouped output by 'archive_version_year'. You can override
## using the `.groups` argument.
kable(table1,caption="Share of establishments in/out top 100 MSAs")
| archive_version_year | outside_top_100 | within_top_100 | outside_prop | inside_prop |
|---|---|---|---|---|
| 2013 | 5516520 | 10437838 | 0.3457688 | 0.6542312 |
| 2014 | 5573021 | 10343882 | 0.3501322 | 0.6498678 |
| 2015 | 5463901 | 10165015 | 0.3496020 | 0.6503980 |
| 2016 | 5423220 | 10293413 | 0.3450625 | 0.6549375 |
| 2017 | 5124766 | 9702386 | 0.3456339 | 0.6543661 |
| 2018 | 5050960 | 9648466 | 0.3436161 | 0.6563839 |
| 2019 | 5151613 | 10345141 | 0.3324317 | 0.6675683 |
| 2020 | 5139349 | 10385438 | 0.3310415 | 0.6689585 |
| 2021 | 5247277 | 10705948 | 0.3289164 | 0.6710836 |
| 2022 | 5472590 | 11401758 | 0.3243142 | 0.6756858 |
moves <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/migration_map.csv") %>%
mutate(old_outside_top_100=ifelse(is.na(avg_eal_tercile_old),"outside_top_100","within_top_100")) %>%
mutate(new_outside_top_100=ifelse(is.na(avg_eal_tercile_new),"outside_top_100","within_top_100")) %>%
mutate(type=paste0(old_outside_top_100,"_to_",new_outside_top_100)) %>%
group_by(archive_version_year,type) %>%
summarise(moves=sum(row_count,na.rm = TRUE)) %>%
pivot_wider(id_cols = archive_version_year,names_from = type,values_from = moves) %>%
rowwise() %>%
mutate(total = sum(c_across(-1), na.rm = TRUE))
## Rows: 63970 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): fips_code, lead_county, avg_eal_tercile_old, NAME_old, avg_eal_ter...
## dbl (12): archive_version_year, row_count, CBSAFP_old, CBSAFP_new, avg_eal_o...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'archive_version_year'. You can override using the `.groups` argument.
kable(moves,caption="Moves in.out top 100 MSAs")
| archive_version_year | outside_top_100_to_outside_top_100 | outside_top_100_to_within_top_100 | within_top_100_to_outside_top_100 | within_top_100_to_within_top_100 | total |
|---|---|---|---|---|---|
| 2012 | 3317 | 2218 | 2054 | 3635 | 7907 |
| 2013 | 3422 | 2435 | 2252 | 3963 | 8650 |
| 2014 | 2876 | 1899 | 1868 | 3078 | 6845 |
| 2015 | 2415 | 1740 | 1549 | 2642 | 5931 |
| 2016 | 3372 | 2437 | 2190 | 3948 | 8575 |
| 2017 | 2904 | 2411 | 2248 | 3944 | 8603 |
| 2018 | 4045 | 3731 | 3342 | 6559 | 13632 |
| 2019 | 3412 | 2721 | 2812 | 4895 | 10428 |
| 2020 | 2525 | 2143 | 2195 | 4230 | 8568 |
| 2021 | 2953 | 2672 | 2881 | 5778 | 11331 |
moves_prop <- moves %>%
mutate(prop_outside_top_100_to_outside_top_100=outside_top_100_to_outside_top_100/total,
prop_outside_top_100_to_within_top_100=outside_top_100_to_within_top_100/total,
prop_within_top_100_to_outside_top_100=within_top_100_to_outside_top_100/total,
prop_within_top_100_to_within_top_100=within_top_100_to_within_top_100/total) %>%
select(-c(2:5))
kable(moves,caption="Shares of moves in.out top 100 MSAs")
| archive_version_year | outside_top_100_to_outside_top_100 | outside_top_100_to_within_top_100 | within_top_100_to_outside_top_100 | within_top_100_to_within_top_100 | total |
|---|---|---|---|---|---|
| 2012 | 3317 | 2218 | 2054 | 3635 | 7907 |
| 2013 | 3422 | 2435 | 2252 | 3963 | 8650 |
| 2014 | 2876 | 1899 | 1868 | 3078 | 6845 |
| 2015 | 2415 | 1740 | 1549 | 2642 | 5931 |
| 2016 | 3372 | 2437 | 2190 | 3948 | 8575 |
| 2017 | 2904 | 2411 | 2248 | 3944 | 8603 |
| 2018 | 4045 | 3731 | 3342 | 6559 | 13632 |
| 2019 | 3412 | 2721 | 2812 | 4895 | 10428 |
| 2020 | 2525 | 2143 | 2195 | 4230 | 8568 |
| 2021 | 2953 | 2672 | 2881 | 5778 | 11331 |
3 is highest risk, 2 is moderate, 1 is low
table <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/census.csv")
## Rows: 4 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile
## dbl (18): asian, avghhsize, bacplus, black, child, density, hispanic, medgro...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
table2 <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/industry.csv")
## Rows: 4 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile
## dbl (14): FIRE, agriculutre_forestry_etc, armed_forces, arts_ent_rec_etc, co...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
kable(table,caption="Census Variables by Risk Level")
| avg_eal_tercile | asian | avghhsize | bacplus | black | child | density | hispanic | medgrossrents | medhhinc | medianhhval | other | owner_occupied | poverty | senior | total | unemp | vacant | white |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.0315712 | 2.492941 | 0.3567584 | 0.1567143 | 0.2055315 | 345.1562 | 0.1144124 | 1042.8824 | 68181.35 | 219141.2 | 0.0381176 | 0.6606767 | 0.1227416 | 0.1556827 | 1494376.3 | 0.0503188 | 0.0879593 | 0.6591846 |
| 2 | 0.0374453 | 2.589091 | 0.3574478 | 0.1343262 | 0.2111732 | 604.2491 | 0.1572416 | 1089.0303 | 71106.09 | 240212.1 | 0.0427232 | 0.6623240 | 0.1234514 | 0.1502595 | 2060857.9 | 0.0543446 | 0.0858594 | 0.6282638 |
| 3 | 0.0889935 | 2.749091 | 0.3387351 | 0.0946081 | 0.2056592 | 828.4572 | 0.2888781 | 1412.9091 | 75917.24 | 387818.2 | 0.0494946 | 0.6279769 | 0.1258272 | 0.1651189 | 3149811.3 | 0.0588635 | 0.1053852 | 0.4780258 |
| Outside | 0.0150771 | 2.527414 | 0.2405747 | 0.0833987 | 0.2033337 | 115.2304 | 0.1335524 | 854.9571 | 56550.76 | 176321.7 | 0.0440430 | 0.6892702 | 0.1554496 | 0.1803648 | 109954.8 | 0.0553571 | 0.1511438 | 0.7239288 |
kable(table2,caption="Census Industry by Risk Level")
| avg_eal_tercile | FIRE | agriculutre_forestry_etc | armed_forces | arts_ent_rec_etc | construction | education_healthcare_etc | information | manufacturing | other | professional_science_managment_etc | public_admin | retail | transportation_warehouse_utilities | wholesale |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.0722513 | 0.0089891 | 0.0099165 | 0.0835972 | 0.0632696 | 0.2451436 | 0.0180693 | 0.0957320 | 0.0461523 | 0.1176257 | 0.0523191 | 0.1067635 | 0.0549619 | 0.0252088 |
| 2 | 0.0703715 | 0.0083459 | 0.0074881 | 0.0846070 | 0.0643641 | 0.2331948 | 0.0176900 | 0.1133960 | 0.0466104 | 0.1192654 | 0.0454758 | 0.1095589 | 0.0535857 | 0.0260463 |
| 3 | 0.0670700 | 0.0173628 | 0.0086841 | 0.1020863 | 0.0728896 | 0.2182436 | 0.0198586 | 0.0764031 | 0.0474106 | 0.1310278 | 0.0458772 | 0.1119322 | 0.0562385 | 0.0249155 |
| Outside | 0.0444278 | 0.0391243 | 0.0089660 | 0.0892926 | 0.0697175 | 0.2388585 | 0.0121106 | 0.1328069 | 0.0469522 | 0.0739920 | 0.0527425 | 0.1165147 | 0.0529105 | 0.0215839 |
library(rstatix)
##
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
##
## filter
table <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/census_cbsas.csv")
## Rows: 939 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile
## dbl (18): asian, avghhsize, black, hispanic, medgrossrents, medhhinc, median...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
names <- names(table)[2:19]
table_tidy <- table %>% pivot_longer(!avg_eal_tercile)
for (i in names) {
print(i)
x <- table_tidy %>%
filter(name==i) %>%
mutate(avg_eal_tercile=as.character(avg_eal_tercile)) %>%
select(-name)
pairwise_result <- x %>% pairwise_t_test(value~avg_eal_tercile, p.adjust.method = "bonferroni") %>%
mutate(variable=i) %>%
select(variable, everything()) %>%
select(-.y.)
print(pairwise_result)
}
## [1] "asian"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 asian 1 2 34 33 3.9 e- 1 ns 1 e+ 0 ns
## 2 asian 1 3 34 33 1.55e-16 **** 9.31e-16 ****
## 3 asian 2 3 33 33 1.57e-13 **** 9.45e-13 ****
## 4 asian 1 Outside 34 839 7.73e- 4 *** 4.64e- 3 **
## 5 asian 2 Outside 33 839 7.32e- 6 **** 4.39e- 5 ****
## 6 asian 3 Outside 33 839 3.22e-45 **** 1.93e-44 ****
## [1] "avghhsize"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 avghhsize 1 2 34 33 7.5 e-2 ns 4.5 e-1 ns
## 2 avghhsize 1 3 34 33 2.37e-6 **** 1.42e-5 ****
## 3 avghhsize 2 3 33 33 3.32e-3 ** 1.99e-2 *
## 4 avghhsize 1 Outside 34 839 3.72e-1 ns 1 e+0 ns
## 5 avghhsize 2 Outside 33 839 1.16e-1 ns 6.94e-1 ns
## 6 avghhsize 3 Outside 33 839 2.03e-8 **** 1.22e-7 ****
## [1] "black"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 black 1 2 34 33 0.462 ns 1 ns
## 2 black 1 3 34 33 0.0416 * 0.25 ns
## 3 black 2 3 33 33 0.196 ns 1 ns
## 4 black 1 Outside 34 839 0.000798 *** 0.00479 **
## 5 black 2 Outside 33 839 0.0214 * 0.129 ns
## 6 black 3 Outside 33 839 0.612 ns 1 ns
## [1] "hispanic"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 hispanic 1 2 34 33 0.354 ns 1 ns
## 2 hispanic 1 3 34 33 0.00017 *** 0.00102 **
## 3 hispanic 2 3 33 33 0.00479 ** 0.0287 *
## 4 hispanic 1 Outside 34 839 0.563 ns 1 ns
## 5 hispanic 2 Outside 33 839 0.48 ns 1 ns
## 6 hispanic 3 Outside 33 839 0.0000042 **** 0.0000252 ****
## [1] "medgrossrents"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 medgrossren… 1 2 34 33 3.75e- 1 ns 1 e+ 0 ns
## 2 medgrossren… 1 3 34 33 2.24e-12 **** 1.34e-11 ****
## 3 medgrossren… 2 3 33 33 9.51e-10 **** 5.71e- 9 ****
## 4 medgrossren… 1 Outsi… 34 839 5.40e- 7 **** 3.24e- 6 ****
## 5 medgrossren… 2 Outsi… 33 839 8.64e-10 **** 5.18e- 9 ****
## 6 medgrossren… 3 Outsi… 33 839 1.57e-44 **** 9.40e-44 ****
## [1] "medhhinc"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 medhhinc 1 2 34 33 3.35e- 1 ns 1 e+ 0 ns
## 2 medhhinc 1 3 34 33 1.09e- 2 * 6.54e- 2 ns
## 3 medhhinc 2 3 33 33 1.16e- 1 ns 6.94e- 1 ns
## 4 medhhinc 1 Outside 34 839 1.06e- 7 **** 6.39e- 7 ****
## 5 medhhinc 2 Outside 33 839 6.5 e-11 **** 3.90e-10 ****
## 6 medhhinc 3 Outside 33 839 6.88e-18 **** 4.13e-17 ****
## [1] "medianhhval"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 medianhhval 1 2 34 33 4.07e- 1 ns 1 e+ 0 ns
## 2 medianhhval 1 3 34 33 5.51e-11 **** 3.31e-10 ****
## 3 medianhhval 2 3 33 33 1.12e- 8 **** 6.74e- 8 ****
## 4 medianhhval 1 Outside 34 839 1.89e- 2 * 1.13e- 1 ns
## 5 medianhhval 2 Outside 33 839 5.64e- 4 *** 3.38e- 3 **
## 6 medianhhval 3 Outside 33 839 1.61e-28 **** 9.68e-28 ****
## [1] "owner_occupied"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 owner_occupied 1 2 34 33 9.14e-1 ns 1 e+0 ns
## 2 owner_occupied 1 3 34 33 3.16e-2 * 1.89e-1 ns
## 3 owner_occupied 2 3 33 33 2.5 e-2 * 1.5 e-1 ns
## 4 owner_occupied 1 Outsi… 34 839 8.68e-3 ** 5.21e-2 ns
## 5 owner_occupied 2 Outsi… 33 839 1.48e-2 * 8.85e-2 ns
## 6 owner_occupied 3 Outsi… 33 839 3.58e-8 **** 2.15e-7 ****
## [1] "poverty"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 poverty 1 2 34 33 0.963 ns 1 ns
## 2 poverty 1 3 34 33 0.839 ns 1 ns
## 3 poverty 2 3 33 33 0.877 ns 1 ns
## 4 poverty 1 Outside 34 839 0.00275 ** 0.0165 *
## 5 poverty 2 Outside 33 839 0.00388 ** 0.0233 *
## 6 poverty 3 Outside 33 839 0.00749 ** 0.0449 *
## [1] "total"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 total 1 2 34 33 8.36e- 3 ** 5.01e- 2 ns
## 2 total 1 3 34 33 2.91e-14 **** 1.74e-13 ****
## 3 total 2 3 33 33 5.50e- 7 **** 3.30e- 6 ****
## 4 total 1 Outside 34 839 1.02e-18 **** 6.14e-18 ****
## 5 total 2 Outside 33 839 2.02e-33 **** 1.21e-32 ****
## 6 total 3 Outside 33 839 1.69e-71 **** 1.01e-70 ****
## [1] "unemp"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 unemp 1 2 34 33 0.444 ns 1 ns
## 2 unemp 1 3 34 33 0.104 ns 0.626 ns
## 3 unemp 2 3 33 33 0.394 ns 1 ns
## 4 unemp 1 Outside 34 839 0.181 ns 1 ns
## 5 unemp 2 Outside 33 839 0.791 ns 1 ns
## 6 unemp 3 Outside 33 839 0.358 ns 1 ns
## [1] "vacant"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 vacant 1 2 34 33 0.907 ns 1 e+0 ns
## 2 vacant 1 3 34 33 0.333 ns 1 e+0 ns
## 3 vacant 2 3 33 33 0.282 ns 1 e+0 ns
## 4 vacant 1 Outside 34 839 0.00000109 **** 6.56e-6 ****
## 5 vacant 2 Outside 33 839 0.000000693 **** 4.16e-6 ****
## 6 vacant 3 Outside 33 839 0.000482 *** 2.89e-3 **
## [1] "white"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 white 1 2 34 33 5.38e- 1 ns 1 e+ 0 ns
## 2 white 1 3 34 33 3.22e- 4 *** 1.93e- 3 **
## 3 white 2 3 33 33 3.04e- 3 ** 1.82e- 2 *
## 4 white 1 Outside 34 839 7.18e- 2 ns 4.31e- 1 ns
## 5 white 2 Outside 33 839 8.8 e- 3 ** 5.28e- 2 ns
## 6 white 3 Outside 33 839 2.63e-11 **** 1.58e-10 ****
## [1] "other"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 other 1 2 34 33 0.721 ns 1 ns
## 2 other 1 3 34 33 0.377 ns 1 ns
## 3 other 2 3 33 33 0.602 ns 1 ns
## 4 other 1 Outside 34 839 0.52 ns 1 ns
## 5 other 2 Outside 33 839 0.888 ns 1 ns
## 6 other 3 Outside 33 839 0.56 ns 1 ns
## [1] "density"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 density 1 2 34 33 9.40e- 8 **** 5.64e- 7 ****
## 2 density 1 3 34 33 1.4 e-22 **** 8.39e-22 ****
## 3 density 2 3 33 33 4.34e- 6 **** 2.61e- 5 ****
## 4 density 1 Outside 34 839 4.39e-11 **** 2.63e-10 ****
## 5 density 2 Outside 33 839 1.74e-40 **** 1.05e-39 ****
## 6 density 3 Outside 33 839 9.06e-77 **** 5.43e-76 ****
## [1] "child"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 child 1 2 34 33 0.433 ns 1 ns
## 2 child 1 3 34 33 0.986 ns 1 ns
## 3 child 2 3 33 33 0.447 ns 1 ns
## 4 child 1 Outside 34 839 0.67 ns 1 ns
## 5 child 2 Outside 33 839 0.134 ns 0.804 ns
## 6 child 3 Outside 33 839 0.657 ns 1 ns
## [1] "senior"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 senior 1 2 34 33 0.584 ns 1 ns
## 2 senior 1 3 34 33 0.34 ns 1 ns
## 3 senior 2 3 33 33 0.136 ns 0.817 ns
## 4 senior 1 Outside 34 839 0.000513 *** 0.00308 **
## 5 senior 2 Outside 33 839 0.0000303 **** 0.000182 ***
## 6 senior 3 Outside 33 839 0.034 * 0.204 ns
## [1] "bacplus"
## # A tibble: 6 × 9
## variable group1 group2 n1 n2 p p.signif p.adj p.adj.signif
## <chr> <chr> <chr> <int> <int> <dbl> <chr> <dbl> <chr>
## 1 bacplus 1 2 34 33 9.74e- 1 ns 1 e+ 0 ns
## 2 bacplus 1 3 34 33 3.89e- 1 ns 1 e+ 0 ns
## 3 bacplus 2 3 33 33 3.74e- 1 ns 1 e+ 0 ns
## 4 bacplus 1 Outside 34 839 2.09e-14 **** 1.25e-13 ****
## 5 bacplus 2 Outside 33 839 3.39e-14 **** 2.03e-13 ****
## 6 bacplus 3 Outside 33 839 1.58e-10 **** 9.5 e-10 ****
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
filter(!is.na(avg_eal))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
plot1 <- data %>% ggplot()+
geom_line(aes(archive_version_year,rows,color=NAME),alpha=0.5)+
facet_wrap(~avg_eal_tercile,ncol =3)+
ggtitle("Number of Businesses")
# ggsave("figures/number_of_businesses.png")
plot2 <- data %>% ggplot()+
geom_line(aes(archive_version_year,mean_year_established,color=NAME),alpha=0.5)+
facet_wrap(~avg_eal_tercile,ncol =3)+
ggtitle("Mean year established")
# ggsave("figures/mean_year_established.png")
ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot2) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
library(ggplot2)
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
filter(!is.na(avg_eal))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile,
levels = c("High", "Moderate", "Low"),
labels = c("High Risk", "Moderate Risk", "Low Risk"))
### Plot 1: Number of Businesses
medians_business <- data %>%
group_by(archive_version_year, avg_eal_tercile) %>%
summarise(median_business = median(rows, na.rm = TRUE), .groups = "drop")
# interactive plot (all individual lines in light gray plus the median line)
plot1_interactive <- ggplot() +
geom_line(data = data, aes(x = archive_version_year, y = rows, group = NAME),
color = "grey", alpha = 0.3) +
geom_line(data = medians_business, aes(x = archive_version_year, y = median_business),
color = "blue", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3) +
labs(x = "Year", y = "Number of Businesses",
title = "Number of Businesses by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
plot1_interactive_plotly <- ggplotly(plot1_interactive)
# static plot with only the median line
plot1_static <- ggplot(medians_business, aes(x = archive_version_year, y = median_business)) +
geom_line(data = medians_business, aes(x = archive_version_year, y = median_business),
color = "blue", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3) +
labs(x = "Year", y = "Median Number of Businesses",
title = "Median Number of Businesses by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
# ggsave("figures/number_of_businesses_median.png", plot = plot1_static)
plot1_static_bar <- ggplot(medians_business, aes(x = archive_version_year, y = median_business)) +
geom_bar(stat = "identity", fill = "blue", alpha = 0.7) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = labeller(avg_eal_tercile = c("High" = "High Risk", "Moderate" = "Moderate Risk", "Low" = "Low Risk"))) +
labs(x = "Year", y = "Median Number of Businesses",
title = "Median Number of Businesses by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
### Plot 2: Mean Year Established
medians_year <- data %>%
group_by(archive_version_year, avg_eal_tercile) %>%
summarise(median_year = median(mean_year_established, na.rm = TRUE), .groups = "drop")
# interactive plot (all individual lines in light gray plus the median line)
plot2_interactive <- ggplot() +
geom_line(data = data, aes(x = archive_version_year, y = mean_year_established, group = NAME),
color = "grey", alpha = 0.3) +
geom_line(data = medians_year, aes(x = archive_version_year, y = median_year),
color = "red", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3) +
labs(x = "Year", y = "Mean Year Established",
title = "Mean Year Established by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
plot2_interactive_plotly <- ggplotly(plot2_interactive)
# static plot with only the median line for Mean Year Established
plot2_static <- ggplot(medians_year, aes(x = archive_version_year, y = median_year)) +
geom_line(color = "red", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3) +
labs(x = "Year", y = "Median Mean Year Established",
title = "Median Mean Year Established by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
# ggsave("figures/mean_year_established_median.png", plot = plot2_static)
plot1_interactive_plotly # interactive plot 1 will all + median
plot1_static # static plot 1 median only
plot1_interactive # static plot 1 with all + median
plot2_interactive_plotly # interactive plot 2 will all + median
plot2_static # static plot 2 with median only
plot2_interactive # static plot 2 with all + median
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv")
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
plot1 <- data %>% ggplot()+
geom_line(aes(archive_version_year,births,color=NAME,group=NAME),alpha=0.5)+
facet_wrap(~avg_eal_tercile,ncol =3)+
ggtitle("Births")
# ggsave("figures/births.png")
plot2 <- data %>% ggplot()+
geom_line(aes(archive_version_year,deaths,color=NAME,group=NAME),alpha=0.5)+
facet_wrap(~avg_eal_tercile,ncol =3)+
ggtitle("Deaths")
# ggsave("figures/deaths.png")
plot3 <- data %>%
ggplot()+
geom_line(aes(archive_version_year,entry,color=NAME,group=NAME),alpha=0.5)+
facet_wrap(~avg_eal_tercile,ncol =3)+
ggtitle("Entries")
# ggsave("figures/entries.png")
plot4 <- data %>% ggplot()+
geom_line(aes(archive_version_year,exit,color=NAME,group=NAME),alpha=0.5)+
facet_wrap(~avg_eal_tercile,ncol =)+
ggtitle("Exits")
# ggsave("figures/exits.png")
ggplotly(plot1)
ggplotly(plot2)
ggplotly(plot3)
ggplotly(plot4)
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv")
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
risk_labeller <- labeller(avg_eal_tercile = c("High" = "High Risk",
"Moderate" = "Moderate Risk",
"Low" = "Low Risk"))
### 1. Births
medians_births <- data %>%
group_by(archive_version_year, avg_eal_tercile) %>%
summarise(median_births = median(births, na.rm = TRUE), .groups = "drop")
# Interactive plot: all individual lines (light gray) with median line (blue)
plot_births_interactive <- ggplot() +
geom_line(data = data, aes(x = archive_version_year, y = births, group = NAME),
color = "grey", alpha = 0.3) +
geom_line(data = medians_births, aes(x = archive_version_year, y = median_births),
color = "blue", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Number of Business Births",
title = "Business Births by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
plot_births_interactive_plotly <- ggplotly(plot_births_interactive)
# Static plot: only the median line
plot_births_static <- ggplot(medians_births, aes(x = archive_version_year, y = median_births)) +
geom_line(color = "blue", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Median Number of Business Births",
title = "Median Business Births by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
### 2. Deaths
medians_deaths <- data %>%
group_by(archive_version_year, avg_eal_tercile) %>%
summarise(median_deaths = median(deaths, na.rm = TRUE), .groups = "drop")
plot_deaths_interactive <- ggplot() +
geom_line(data = data, aes(x = archive_version_year, y = deaths, group = NAME),
color = "grey", alpha = 0.3) +
geom_line(data = medians_deaths, aes(x = archive_version_year, y = median_deaths),
color = "red", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Number of Business Deaths",
title = "Business Deaths by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
plot_deaths_interactive_plotly <- ggplotly(plot_deaths_interactive)
plot_deaths_static <- ggplot(medians_deaths, aes(x = archive_version_year, y = median_deaths)) +
geom_line(color = "red", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Median Number of Business Deaths",
title = "Median Business Deaths by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
### 3. Entries
medians_entries <- data %>%
group_by(archive_version_year, avg_eal_tercile) %>%
summarise(median_entries = median(entry, na.rm = TRUE), .groups = "drop")
plot_entries_interactive <- ggplot() +
geom_line(data = data, aes(x = archive_version_year, y = entry, group = NAME),
color = "grey", alpha = 0.3) +
geom_line(data = medians_entries, aes(x = archive_version_year, y = median_entries),
color = "green", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Number of Business Entries",
title = "Business Entries by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
plot_entries_interactive_plotly <- ggplotly(plot_entries_interactive)
plot_entries_static <- ggplot(medians_entries, aes(x = archive_version_year, y = median_entries)) +
geom_line(color = "green", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Median Number of Business Entries",
title = "Median Business Entries by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
### 4. Exits
medians_exits <- data %>%
group_by(archive_version_year, avg_eal_tercile) %>%
summarise(median_exits = median(exit, na.rm = TRUE), .groups = "drop")
plot_exits_interactive <- ggplot() +
geom_line(data = data, aes(x = archive_version_year, y = exit, group = NAME),
color = "grey", alpha = 0.3) +
geom_line(data = medians_exits, aes(x = archive_version_year, y = median_exits),
color = "purple", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Number of Business Exits",
title = "Business Exits by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
plot_exits_interactive_plotly <- ggplotly(plot_exits_interactive)
plot_exits_static <- ggplot(medians_exits, aes(x = archive_version_year, y = median_exits)) +
geom_line(color = "purple", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Median Number of Business Exits",
title = "Median Business Exits by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
# interactive with all + median
plot_births_interactive_plotly
plot_deaths_interactive_plotly
plot_entries_interactive_plotly
plot_exits_interactive_plotly
# static with median
plot_births_static
plot_deaths_static
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
plot_entries_static
plot_exits_static
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
# static with all + median
plot_births_interactive
plot_deaths_interactive
## Warning: Removed 100 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
plot_entries_interactive
plot_exits_interactive
## Warning: Removed 100 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all.csv")
all$avg_eal_tercile <- factor(all$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
all$pop_tercile <- factor(all$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))
births <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_births.csv") %>%
filter(archive_version_year != 2012)
births$avg_eal_tercile <- factor(births$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
births$pop_tercile <- factor(births$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))
entries <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_entries.csv")%>%
filter(archive_version_year != 2012)
entries$avg_eal_tercile <- factor(entries$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
entries$pop_tercile <- factor(entries$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))
deaths <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_deaths.csv")%>%
filter(archive_version_year != 2022)
deaths$avg_eal_tercile <- factor(deaths$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
deaths$pop_tercile <- factor(deaths$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))
exits <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_exits.csv")%>%
filter(archive_version_year != 2022)
exits$avg_eal_tercile <- factor(exits$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
exits$pop_tercile <- factor(exits$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))
plot1 <- all %>%
ggplot()+
geom_line(aes(archive_version_year,row_count,color=avg_eal_tercile))+
facet_grid(pop_tercile~critical)+
ggtitle("Critical by population tercile, all")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/critical_by_pop.png")
plot2 <- births %>%ggplot()+
geom_line(aes(archive_version_year,births,color=avg_eal_tercile))+
facet_grid(pop_tercile~critical)+
ggtitle("Critical by tercile, births")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/naics_by_tercile_births.png")
plot3 <- deaths %>% ggplot()+
geom_line(aes(archive_version_year,deaths,color=avg_eal_tercile))+
facet_grid(pop_tercile~critical)+
ggtitle("Critical by tercile, deaths")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/naics_by_tercile_deaths.png")
plot4 <- entries %>% ggplot()+
geom_line(aes(archive_version_year,entries,color=avg_eal_tercile))+
facet_grid(pop_tercile~critical)+
ggtitle("Critical by tercile, entries")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/naics_by_tercile_entries.png")
plot5 <- exits %>% ggplot()+
geom_line(aes(archive_version_year,exits,color=avg_eal_tercile))+
facet_grid(pop_tercile~critical)+
ggtitle("Critical by tercile, exits")
# ggsave("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/JoannaW/figures/naics_by_tercile_exits.png")
plot1
plot2
plot3
plot4
plot5
all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all.csv") %>%
filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile))
## Rows: 220 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pop_tercile, avg_eal_tercile, critical
## dbl (2): archive_version_year, row_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all$avg_eal_tercile <- factor(all$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
all$pop_tercile <- factor(all$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))
all <- all %>%
mutate(critical = recode(critical,
"Not" = "Non-Critical",
"Critical" = "Critical"))
plot1 <- all %>%
ggplot(aes(x = archive_version_year, y = row_count, color = critical)) +
geom_line(size = 0.6, alpha = 0.8) +
facet_grid(pop_tercile ~ avg_eal_tercile,
labeller = labeller(avg_eal_tercile = c("High" = "High Risk",
"Moderate" = "Moderate Risk",
"Low" = "Low Risk"))) +
labs(x = "Year",
y = "Number of Business",
title = "Business Counts by Population Tercile and Risk Level",
color = "Critical Status") +
theme_minimal() +
theme(legend.position = "top")
plot1
all <- all %>%
mutate(avg_eal_tercile = factor(avg_eal_tercile, levels = c("High", "Moderate", "Low"),
labels = c("High Risk", "Moderate Risk", "Low Risk")),
pop_tercile = factor(pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low")))
plot1_stk <- all %>%
ggplot(aes(x = archive_version_year, y = row_count, fill = critical)) +
geom_bar(stat = "identity", position = "stack") +
facet_grid(pop_tercile ~ avg_eal_tercile) +
labs(x = "Year",
y = "Number of Business",
title = "Business Counts by Population Tercile and Risk Level",
fill = "Critical Status") +
theme_minimal() +
theme(legend.position = "top")
plot1_stk
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
mutate(CBSAFP=as.character(CBSAFP)) %>%
filter(!is.na(avg_eal)) %>%
left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))
plot1 <- data %>% ggplot()+
geom_line(aes(archive_version_year,rows,color=NAME),alpha=0.5)+
facet_grid(pop_tercile~avg_eal_tercile)+
ggtitle("Number of Businesses")
# ggsave("figures/number_of_businesses.png")
plot2 <- data %>% ggplot()+
geom_line(aes(archive_version_year,mean_year_established,color=NAME),alpha=0.5)+
facet_grid(pop_tercile~avg_eal_tercile)+
ggtitle("Mean year established")
# ggsave("figures/mean_year_established.png")
ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot2) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
mutate(CBSAFP = as.character(CBSAFP)) %>%
filter(!is.na(avg_eal)) %>%
left_join(top100_cbsa, by = c("CBSAFP" = "GEOID")) %>%
filter(!is.na(pop_tercile))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile,
levels = c("High", "Moderate", "Low"),
labels = c("High Risk", "Moderate Risk", "Low Risk"))
data$pop_tercile <- factor(data$pop_tercile,
levels = c("Pop: Low", "Pop: Moderate", "Pop: High"),
labels = c("Low Population", "Moderate Population", "High Population"))
### Plot 1: Number of Businesses
medians_business <- data %>%
group_by(archive_version_year, pop_tercile, avg_eal_tercile) %>%
summarise(median_rows = median(rows, na.rm = TRUE), .groups = "drop")
# Combined Plot: All individual CBSA lines (thin gray) + median line (blue)
plot1_combined <- ggplot() +
geom_line(data = data, aes(x = archive_version_year, y = rows, group = NAME),
color = "grey", size = 0.5, alpha = 0.5) +
geom_line(data = medians_business, aes(x = archive_version_year, y = median_rows),
color = "blue", size = 1.2) +
facet_grid(pop_tercile ~ avg_eal_tercile) +
labs(x = "Year", y = "Number of Businesses",
title = "Number of Businesses by Population and Risk Level") +
theme_minimal() +
theme(legend.position = "none")
# Static Plot: Only the median line for Number of Businesses
plot1_median <- ggplot(medians_business, aes(x = archive_version_year, y = median_rows)) +
geom_line(color = "blue", size = 1.2) +
facet_grid(pop_tercile ~ avg_eal_tercile) +
labs(x = "Year", y = "Median Number of Businesses",
title = "Median Number of Businesses by Population and Risk Level") +
theme_minimal() +
theme(legend.position = "none")
### Plot 2: Mean Year Established
medians_year <- data %>%
group_by(archive_version_year, pop_tercile, avg_eal_tercile) %>%
summarise(median_year = median(mean_year_established, na.rm = TRUE), .groups = "drop")
# Combined Plot: All individual CBSA lines (thin gray) + median line (red)
plot2_combined <- ggplot() +
geom_line(data = data, aes(x = archive_version_year, y = mean_year_established, group = NAME),
color = "grey", size = 0.5, alpha = 0.5) +
geom_line(data = medians_year, aes(x = archive_version_year, y = median_year),
color = "red", size = 1.2) +
facet_grid(pop_tercile ~ avg_eal_tercile) +
labs(x = "Year", y = "Mean Year Established",
title = "Mean Year Established by Population and Risk Level") +
theme_minimal() +
theme(legend.position = "none")
# Static Plot: Only the median line for Mean Year Established
plot2_median <- ggplot(medians_year, aes(x = archive_version_year, y = median_year)) +
geom_line(color = "red", size = 1.2) +
facet_grid(pop_tercile ~ avg_eal_tercile) +
labs(x = "Year", y = "Median Mean Year Established",
title = "Median Mean Year Established by Population and Risk Level") +
theme_minimal() +
theme(legend.position = "none")
# combined plots
plot1_combined
plot2_combined
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).
# static median-only plots
plot1_median
plot2_median
# interactive version
# interactive_plot1 <- ggplotly(plot1_combined)
# interactive_plot2 <- ggplotly(plot2_combined)
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv")%>%
mutate(CBSAFP=as.character(CBSAFP)) %>%
filter(!is.na(avg_eal)) %>%
left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))
plot1 <- data %>% ggplot()+
geom_line(aes(archive_version_year,births,color=NAME,group=NAME),alpha=0.5)+
facet_grid(pop_tercile~avg_eal_tercile)+
ggtitle("Births")
# ggsave("figures/births.png")
plot2 <- data %>% ggplot()+
geom_line(aes(archive_version_year,deaths,color=NAME,group=NAME),alpha=0.5)+
facet_grid(pop_tercile~avg_eal_tercile)+
ggtitle("Deaths")
# ggsave("figures/deaths.png")
plot3 <- data %>%
ggplot()+
geom_line(aes(archive_version_year,entry,color=NAME,group=NAME),alpha=0.5)+
facet_grid(pop_tercile~avg_eal_tercile)+
ggtitle("Entries")
# ggsave("figures/entries.png")
plot4 <- data %>% ggplot()+
geom_line(aes(archive_version_year,exit,color=NAME,group=NAME),alpha=0.5)+
facet_grid(pop_tercile~avg_eal_tercile)+
ggtitle("Exits")
# ggsave("figures/exits.png")
ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot2) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot3) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot4) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
(will work on this)
I dont know if this is business count because these look different from the business counts by risk level by population level from before
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys_employees.csv")
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
plot1 <- data %>% ggplot()+
geom_line(aes(archive_version_year,births,color=NAME,group=NAME),alpha=0.5)+
facet_wrap(~avg_eal_tercile,ncol =3)+
ggtitle("Births")
# ggsave("figures/births_emp.png")
plot2 <- data %>% ggplot()+
geom_line(aes(archive_version_year,deaths,color=NAME,group=NAME),alpha=0.5)+
facet_wrap(~avg_eal_tercile,ncol =3)+
ggtitle("Deaths")
# ggsave("figures/deaths_emp.png")
plot3 <- data %>%
ggplot()+
geom_line(aes(archive_version_year,entry,color=NAME,group=NAME),alpha=0.5)+
facet_wrap(~avg_eal_tercile,ncol =3)+
ggtitle("Entries - # of employees")
# ggsave("figures/entries_emp.png")
plot4 <- data %>% ggplot()+
geom_line(aes(archive_version_year,exit,color=NAME,group=NAME),alpha=0.5)+
facet_wrap(~avg_eal_tercile,ncol =)+
ggtitle("Exits")
# ggsave("figures/exits_emp.png")
ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot2) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot3) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
ggplotly(plot4) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys_employees.csv")
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
risk_labeller <- labeller(avg_eal_tercile = c("High" = "High Risk",
"Moderate" = "Moderate Risk",
"Low" = "Low Risk"))
### 1. Births
medians_births <- data %>%
group_by(archive_version_year, avg_eal_tercile) %>%
summarise(median_births = median(births, na.rm = TRUE), .groups = "drop")
# Interactive plot: all individual lines (light gray) with median line (blue)
plot_births_interactive <- ggplot() +
geom_line(data = data, aes(x = archive_version_year, y = births, group = NAME),
color = "grey", alpha = 0.3) +
geom_line(data = medians_births, aes(x = archive_version_year, y = median_births),
color = "blue", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Number of Employees",
title = "Employee Size Involved in Business Births by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
plot_births_interactive_plotly <- ggplotly(plot_births_interactive)
# Static plot: only the median line
plot_births_static <- ggplot(medians_births, aes(x = archive_version_year, y = median_births)) +
geom_line(color = "blue", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Median Number of Employee Size",
title = "Median Employee Size by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
### 2. Deaths
medians_deaths <- data %>%
group_by(archive_version_year, avg_eal_tercile) %>%
summarise(median_deaths = median(deaths, na.rm = TRUE), .groups = "drop")
plot_deaths_interactive <- ggplot() +
geom_line(data = data, aes(x = archive_version_year, y = deaths, group = NAME),
color = "grey", alpha = 0.3) +
geom_line(data = medians_deaths, aes(x = archive_version_year, y = median_deaths),
color = "red", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Number of Business Deaths",
title = "Business Deaths by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
# plot_deaths_interactive_plotly <- ggplotly(plot_deaths_interactive)
plot_deaths_static <- ggplot(medians_deaths, aes(x = archive_version_year, y = median_deaths)) +
geom_line(color = "red", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Median Number of Business Deaths",
title = "Median Business Deaths by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
### 3. Entries
medians_entries <- data %>%
group_by(archive_version_year, avg_eal_tercile) %>%
summarise(median_entries = median(entry, na.rm = TRUE), .groups = "drop")
plot_entries_interactive <- ggplot() +
geom_line(data = data, aes(x = archive_version_year, y = entry, group = NAME),
color = "grey", alpha = 0.3) +
geom_line(data = medians_entries, aes(x = archive_version_year, y = median_entries),
color = "green", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Number of Business Entries",
title = "Business Entries by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
# plot_entries_interactive_plotly <- ggplotly(plot_entries_interactive)
plot_entries_static <- ggplot(medians_entries, aes(x = archive_version_year, y = median_entries)) +
geom_line(color = "green", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Median Number of Business Entries",
title = "Median Business Entries by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
### 4. Exits
medians_exits <- data %>%
group_by(archive_version_year, avg_eal_tercile) %>%
summarise(median_exits = median(exit, na.rm = TRUE), .groups = "drop")
plot_exits_interactive <- ggplot() +
geom_line(data = data, aes(x = archive_version_year, y = exit, group = NAME),
color = "grey", alpha = 0.3) +
geom_line(data = medians_exits, aes(x = archive_version_year, y = median_exits),
color = "purple", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Number of Business Exits",
title = "Business Exits by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
# plot_exits_interactive_plotly <- ggplotly(plot_exits_interactive)
plot_exits_static <- ggplot(medians_exits, aes(x = archive_version_year, y = median_exits)) +
geom_line(color = "purple", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = risk_labeller) +
labs(x = "Year", y = "Median Number of Business Exits",
title = "Median Business Exits by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
# interactive with all + median:
# plot_births_interactive_plotly
# plot_deaths_interactive_plotly
# plot_entries_interactive_plotly
# plot_exits_interactive_plotly
# static with median only
plot_births_static
plot_deaths_static
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
plot_entries_static
plot_exits_static
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
# static with all + median
plot_births_interactive
plot_deaths_interactive
## Warning: Removed 100 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
plot_entries_interactive
plot_exits_interactive
## Warning: Removed 100 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all_employees.csv")
all$avg_eal_tercile <- factor(all$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
all$pop_tercile <- factor(all$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))
births <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_births_employees.csv") %>%
filter(archive_version_year != 2012)
births$avg_eal_tercile <- factor(births$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
births$pop_tercile <- factor(births$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))
entries <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_entries_employees.csv")%>%
filter(archive_version_year != 2012)
entries$avg_eal_tercile <- factor(entries$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
entries$pop_tercile <- factor(entries$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))
deaths <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_deaths_employees.csv")%>%
filter(archive_version_year != 2022)
deaths$avg_eal_tercile <- factor(deaths$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
deaths$pop_tercile <- factor(deaths$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))
exits <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_exits_employees.csv")%>%
filter(archive_version_year != 2022)
exits$avg_eal_tercile <- factor(exits$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
exits$pop_tercile <- factor(exits$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))
plot1 <- all %>%
ggplot()+
geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
facet_grid(pop_tercile~critical)+
ggtitle("Critical by population tercile, all - employees")
# ggsave("figures/critical_by_pop_emp.png")
plot2 <- births %>%ggplot()+
geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
facet_grid(pop_tercile~critical)+
ggtitle("Critical by tercile, births, births - employees")
# ggsave("figures/naics_by_tercile_births_emp.png")
plot3 <- deaths %>% ggplot()+
geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
facet_grid(pop_tercile~critical)+
ggtitle("Critical by tercile, deaths - employees")
# ggsave("figures/naics_by_tercile_deaths_emp.png")
plot4 <- entries %>% ggplot()+
geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
facet_grid(pop_tercile~critical)+
ggtitle("Critical by tercile, entries - employees")
# ggsave("figures/naics_by_tercile_entries_emp.png")
plot5 <- exits %>% ggplot()+
geom_line(aes(archive_version_year,employees,color=avg_eal_tercile))+
facet_grid(pop_tercile~critical)+
ggtitle("Critical by tercile, exits - employees")
# ggsave("figures/naics_by_tercile_exit_emps.png")
plot1
plot2
plot3
plot4
plot5
all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all_employees.csv") %>%
filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile))
## Rows: 220 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pop_tercile, avg_eal_tercile, critical
## dbl (2): archive_version_year, employees
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# 2. Factor levels for risk and population terciles
all$avg_eal_tercile <- factor(all$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
all$pop_tercile <- factor(all$pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low"))
# 3. Recode "Not" to "Non-Critical" for the legend
all <- all %>%
mutate(critical = recode(critical,
"Not" = "Non-Critical",
"Critical" = "Critical"))
# Create the plot
plot1 <- all %>%
ggplot(aes(x = archive_version_year, y = employees, color = critical)) +
# Thinner line
geom_line(size = 0.6, alpha = 0.8) +
# Facet by population tercile (rows) and risk level (columns)
facet_grid(pop_tercile ~ avg_eal_tercile,
labeller = labeller(avg_eal_tercile = c("High" = "High Risk",
"Moderate" = "Moderate Risk",
"Low" = "Low Risk"))) +
labs(x = "Year",
y = "Employee Size",
title = "Employee Size by Population Tercile and Risk Level",
color = "Critical Status") +
theme_minimal() +
theme(legend.position = "top")
plot1
all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/critical_pop_all_employees.csv") %>%
filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile))
## Rows: 220 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pop_tercile, avg_eal_tercile, critical
## dbl (2): archive_version_year, employees
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
all <- all %>%
mutate(avg_eal_tercile = factor(avg_eal_tercile, levels = c("High", "Moderate", "Low"),
labels = c("High Risk", "Moderate Risk", "Low Risk")),
pop_tercile = factor(pop_tercile, levels = c("Pop: High", "Pop: Moderate", "Pop: Low")))
# Create the stacked bar plot:
plot1_stk <- all %>%
ggplot(aes(x = archive_version_year, y = employees, fill = critical)) +
geom_bar(stat = "identity", position = "stack") +
facet_grid(pop_tercile ~ avg_eal_tercile) +
labs(x = "Year",
y = "Employee Size",
title = "Employee Size by Population Tercile and Risk Level",
fill = "Critical Status") +
theme_minimal() +
theme(legend.position = "top")
plot1_stk
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
mutate(CBSAFP=as.character(CBSAFP)) %>%
filter(!is.na(avg_eal)) %>%
left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))
plot1 <- data %>% ggplot()+
geom_line(aes(archive_version_year,mean_employees,color=NAME),alpha=0.5)+
facet_grid(pop_tercile~avg_eal_tercile)+
ggtitle("Mean Employees")
# ggsave("figures/number_of_employees.png")
ggplotly(plot1) %>% layout(showlegend = TRUE, legend = list(font = list(size = 6)))
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
mutate(CBSAFP=as.character(CBSAFP)) %>%
filter(!is.na(avg_eal)) %>%
left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
median_meanemployee <- data %>%
filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile)) %>%
group_by(archive_version_year, pop_tercile) %>%
summarise(median_meanemployee = median(mean_employees, na.rm = TRUE), .groups = "drop")
plot1 <- ggplot(median_meanemployee, aes(x = archive_version_year, y = median_meanemployee)) +
geom_line(color = "blue", size = 1.2) +
facet_wrap(~pop_tercile, ncol = 3, labeller = labeller(pop_tercile = c("Pop: High" = "High Population", "Pop: Moderate" = "Moderate Population", "Pop: Low" = "Low Population"))) +
labs(x = "Year", y = "Median Mean Employee Size",
title = "Median Mean Employee Size by Population Level") +
theme_minimal() +
theme(legend.position = "none")
plot1
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
mutate(CBSAFP=as.character(CBSAFP)) %>%
filter(!is.na(avg_eal)) %>%
left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
median_meanemployee <- data %>%
filter(!is.na(avg_eal_tercile) & !is.na(pop_tercile)) %>%
group_by(archive_version_year, avg_eal_tercile) %>%
summarise(median_meanemployee = median(mean_employees, na.rm = TRUE), .groups = "drop")
plot2 <- ggplot(median_meanemployee, aes(x = archive_version_year, y = median_meanemployee)) +
geom_line(color = "blue", size = 1.2) +
facet_wrap(~avg_eal_tercile, ncol = 3, labeller = labeller(avg_eal_tercile = c("High" = "High Risk",
"Moderate" = "Moderate Risk",
"Low" = "Low Risk"))) +
labs(x = "Year", y = "Median Mean Employee Size",
title = "Median Mean Employee Size by Risk Level") +
theme_minimal() +
theme(legend.position = "none")
plot2
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/all_cbsa_summary.csv") %>%
mutate(CBSAFP = as.character(CBSAFP)) %>%
filter(!is.na(avg_eal)) %>%
left_join(top100_cbsa, by = c("CBSAFP" = "GEOID")) %>%
filter(!is.na(pop_tercile))
## Rows: 9400 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile, NAME
## dbl (8): archive_version_year, CBSAFP, rows, with_parent_number, mean_employ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))
medians_employees <- data %>%
group_by(archive_version_year, pop_tercile, avg_eal_tercile) %>%
summarise(median_employees = median(mean_employees, na.rm = TRUE), .groups = "drop")
# Create the combined static plot:
# - Individual CBSA lines in thin gray
# - Overlaid median line in blue
# - Facet by population (rows) and risk (columns) with custom labels
plot1_static <- ggplot() +
geom_line(data = data, aes(x = archive_version_year, y = mean_employees, group = NAME),
color = "grey", size = 0.5, alpha = 0.5) +
geom_line(data = medians_employees, aes(x = archive_version_year, y = median_employees),
color = "blue", size = 1.2) +
facet_grid(pop_tercile ~ avg_eal_tercile,
labeller = labeller(
pop_tercile = c("Pop: Low" = "Low Population",
"Pop: Moderate" = "Moderate Population",
"Pop: High" = "High Population"),
avg_eal_tercile = c("High" = "High Risk",
"Moderate" = "Moderate Risk",
"Low" = "Low Risk")
)) +
labs(x = "Year",
y = "Mean Employees",
title = "Mean Employees by Population and Risk Level") +
theme_minimal() +
theme(legend.position = "none")
plot1_static
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys_employees.csv")%>%
mutate(CBSAFP=as.character(CBSAFP)) %>%
filter(!is.na(avg_eal)) %>%
left_join(top100_cbsa,by=c("CBSAFP"="GEOID"))
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile, levels = c("High", "Moderate", "Low"))
data$pop_tercile <- factor(data$pop_tercile, levels = c("Pop: Low", "Pop: Moderate", "Pop: High"))
plot1 <- data %>% ggplot()+
geom_line(aes(archive_version_year,births,color=NAME,group=NAME),alpha=0.5)+
facet_grid(pop_tercile~avg_eal_tercile)+
ggtitle("Births")
# ggsave("figures/births_emp.png")
plot2 <- data %>% ggplot()+
geom_line(aes(archive_version_year,deaths,color=NAME,group=NAME),alpha=0.5)+
facet_grid(pop_tercile~avg_eal_tercile)+
ggtitle("Deaths")
# ggsave("figures/deaths_emp.png")
plot3 <- data %>%
ggplot()+
geom_line(aes(archive_version_year,entry,color=NAME,group=NAME),alpha=0.5)+
facet_grid(pop_tercile~avg_eal_tercile)+
ggtitle("Entries")
# ggsave("figures/entries_emp.png")
plot4 <- data %>% ggplot()+
geom_line(aes(archive_version_year,exit,color=NAME,group=NAME),alpha=0.5)+
facet_grid(pop_tercile~avg_eal_tercile)+
ggtitle("Exits")
# ggsave("figures/exits_emp.png")
ggplotly(plot1) %>% layout(showlegend = FALSE)
ggplotly(plot2) %>% layout(showlegend = FALSE)
ggplotly(plot3) %>% layout(showlegend = FALSE)
ggplotly(plot4) %>% layout(showlegend = FALSE)
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys_employees.csv")%>%
mutate(CBSAFP=as.character(CBSAFP)) %>%
filter(!is.na(avg_eal)) %>%
left_join(top100_cbsa,by=c("CBSAFP"="GEOID")) %>%
filter(!is.na(pop_tercile))
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$avg_eal_tercile <- factor(data$avg_eal_tercile,
levels = c("High", "Moderate", "Low"),
labels = c("High Risk", "Moderate Risk", "Low Risk"))
data$pop_tercile <- factor(data$pop_tercile,
levels = c("Pop: Low", "Pop: Moderate", "Pop: High"),
labels = c("Low Population", "Moderate Population", "High Population"))
medians_births <- data %>%
group_by(archive_version_year, pop_tercile, avg_eal_tercile) %>%
summarise(median_births = median(births, na.rm = TRUE), .groups = "drop")
plot_births <- ggplot() +
geom_line(data = data,
aes(x = archive_version_year, y = births, group = NAME),
color = "grey", size = 0.5, alpha = 0.5) +
geom_line(data = medians_births,
aes(x = archive_version_year, y = median_births),
color = "blue", size = 1.2) +
facet_grid(pop_tercile ~ avg_eal_tercile) +
labs(x = "Year",
y = "Employee Size",
title = "Employee Size Involved in Business Births by Population and Risk Level") +
theme_minimal() +
theme(legend.position = "none")
# Create interactive plotly version with hover text
# plot_births_interactive <- ggplotly(plot_births, tooltip = "text")
# Display the plots (in your R session, you can view them)
# plot_births_interactive
plot_births
nri_cbsa_all <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/nri_cbsa_all.csv") %>%
mutate(CBSAFP =as.character(CBSAFP))
## Rows: 940 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_all
## dbl (3): CBSAFP, avg_eal_all, tercile_all
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
regions <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/top100_regions.csv") %>%
mutate(GEOID =as.character(GEOID)) %>%
select(GEOID, Region)
## Rows: 100 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): pop_tercile, NAME, Region
## dbl (1): GEOID
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
top100_cbsa <- get_acs(geography = "cbsa", variables = c(pop="B01001_001"),output = "wide") %>%
top_n(100,popE) %>%
mutate(pop_tercile_num = ntile(popE, 3)) %>%
mutate(pop_tercile=as_factor(pop_tercile_num)) %>%
mutate(pop_tercile = factor(pop_tercile, levels = 1:3, labels = c("Pop: Low", "Pop: Moderate", "Pop: High"))) %>%
select(pop_tercile,pop_tercile_num,popE,GEOID) %>%
left_join(regions)
## Getting data from the 2019-2023 5-year ACS
## Warning: • You have not set a Census API key. Users without a key are limited to 500
## queries per day and may experience performance limitations.
## ℹ For best results, get a Census API key at
## http://api.census.gov/data/key_signup.html and then supply the key to the
## `census_api_key()` function to use it throughout your tidycensus session.
## This warning is displayed once per session.
## Joining with `by = join_by(GEOID)`
top100_cbsa_new <- top100_cbsa %>%
rename_with(~paste0(., "_new"), everything())
top100_cbsa_old <- top100_cbsa %>%
rename_with(~paste0(., "_old"), everything())
moves <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/migration_map.csv") %>%
select(CBSAFP_old,CBSAFP_new,NAME_old,NAME_new,row_count,avg_eal_tercile_old,avg_eal_tercile_new) %>%
mutate(CBSAFP_old=as.character(CBSAFP_old)) %>%
mutate(CBSAFP_new=as.character(CBSAFP_new)) %>%
mutate(NAME_old=ifelse(is.na(avg_eal_tercile_old),"Outside",NAME_old)) %>%
mutate(NAME_new=ifelse(is.na(avg_eal_tercile_new),"Outside",NAME_new)) %>%
group_by(CBSAFP_old,CBSAFP_new,NAME_old,NAME_new,avg_eal_tercile_old,avg_eal_tercile_new) %>%
summarise(row_count=sum(row_count)) %>%
left_join(top100_cbsa_old, by=c("CBSAFP_old"="GEOID_old")) %>%
left_join(top100_cbsa_new, by=c("CBSAFP_new"="GEOID_new")) %>%
mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new)) %>%
mutate(Region_old=ifelse(is.na(Region_old),"Outside",Region_old)) %>%
mutate(Region_new=ifelse(is.na(Region_new),"Outside",Region_new))
## Rows: 63970 Columns: 22
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): fips_code, lead_county, avg_eal_tercile_old, NAME_old, avg_eal_ter...
## dbl (12): archive_version_year, row_count, CBSAFP_old, CBSAFP_new, avg_eal_o...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'CBSAFP_old', 'CBSAFP_new', 'NAME_old', 'NAME_new', 'avg_eal_tercile_old'. You can override using the `.groups` argument.
data <- moves %>%
group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption="Risk level to Risk level")
| avg_eal_tercile_old | High | Low | Moderate | Outside |
|---|---|---|---|---|
| High | 0.1418442 | 0.0145098 | 0.0363402 | 0.0729597 |
| Low | 0.0149617 | 0.0161366 | 0.0323389 | 0.0577516 |
| Moderate | 0.0303752 | 0.0298987 | 0.0341958 | 0.0614735 |
| Outside | 0.0696075 | 0.0633796 | 0.0675453 | 0.2566818 |
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Risk level to Risk level", cex.main = 1)
# png(file="figures/risk_to_risk.png")
dev.off()
## null device
## 1
data <- moves %>%
filter(Region_old==Region_new) %>%
filter(Region_old!="Outside") %>%
group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Moves within Region")
| avg_eal_tercile_old | High | Low | Moderate |
|---|---|---|---|
| High | 0.5021631 | 0.0183028 | 0.0656683 |
| Low | 0.0214088 | 0.0354964 | 0.0905158 |
| Moderate | 0.0593086 | 0.0872989 | 0.1198373 |
chordDiagram(
x=data, directional = 1, direction.type = c("diffHeight", "arrows"),link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green"))
title("Moves within Region", cex.main = 1)
# png(file="figures/moves_within_region.png")
dev.off()
## null device
## 1
data <- moves %>%
filter(Region_old!=Region_new) %>%
group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
table <- moves %>%
filter(Region_old!=Region_new) %>%
filter(avg_eal_tercile_new=="Outside") %>%
left_join(nri_cbsa_all,by=c("CBSAFP_new"="CBSAFP")) %>%
group_by(avg_eal_tercile_all) %>%
summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
kable(caption="Risk of Destinations outside top100")
table
| avg_eal_tercile_all | row_count |
|---|---|
| High | 13209 |
| Low | 3834 |
| Moderate | 5771 |
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Moves outside Region")
| avg_eal_tercile_old | High | Low | Moderate | Outside |
|---|---|---|---|---|
| High | 0.0591552 | 0.0204144 | 0.0424992 | 0.1398169 |
| Low | 0.0199004 | 0.0161259 | 0.0238998 | 0.1092194 |
| Moderate | 0.0336171 | 0.0204947 | 0.0147928 | 0.1173948 |
| Outside | 0.1335528 | 0.1191776 | 0.1299390 | NA |
chordDiagram(
x=data, directional = 1, direction.type = c("diffHeight", "arrows"),link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Moves outside Region", cex.main = 1)
# png(file="figures/moves_outside_region.png")
dev.off()
## null device
## 1
moves_table <- moves %>%
group_by(avg_eal_tercile_old, avg_eal_tercile_new) %>%
summarise(row_count = sum(row_count, na.rm = TRUE), .groups = "drop") %>%
# Replace NA with "Outside" for both origin and destination
mutate(
avg_eal_tercile_old = ifelse(is.na(avg_eal_tercile_old), "Outside", avg_eal_tercile_old),
avg_eal_tercile_new = ifelse(is.na(avg_eal_tercile_new), "Outside", avg_eal_tercile_new)
)
total_moves <- sum(moves_table$row_count, na.rm = TRUE)
moves_table <- moves_table %>%
mutate(pct = round(row_count / total_moves * 100, 1))
moves_wide <- moves_table %>%
select(avg_eal_tercile_old, avg_eal_tercile_new, pct) %>%
pivot_wider(names_from = avg_eal_tercile_new, values_from = pct, values_fill = 0)
moves_wide <- moves_wide %>%
rename(`Origin Risk Level` = avg_eal_tercile_old)
desired_order <- c("High", "Moderate", "Low", "Outside")
moves_wide$`Origin Risk Level` <- factor(moves_wide$`Origin Risk Level`,
levels = desired_order)
moves_wide <- moves_wide %>%
select(`Origin Risk Level`, all_of(desired_order)) %>%
arrange(`Origin Risk Level`)
moves_wide %>%
kable(
format = "html",
digits = 1,
caption = "Percentage of Business Moves by Risk Level (Origin → Destination) [%]"
) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE) %>%
add_header_above(c(" " = 1, "Destination Risk Level" = 4))
| Origin Risk Level | High | Moderate | Low | Outside |
|---|---|---|---|---|
| High | 14.2 | 3.6 | 1.5 | 7.3 |
| Moderate | 3.0 | 3.4 | 3.0 | 6.1 |
| Low | 1.5 | 3.2 | 1.6 | 5.8 |
| Outside | 7.0 | 6.8 | 6.3 | 25.7 |
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_crit.csv") %>%
mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 16 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Critical: Risk level to Risk level")
| avg_eal_tercile_old | Low | Moderate | High | Outside |
|---|---|---|---|---|
| Low | 0.0160731 | 0.0317129 | 0.0136222 | 0.0600542 |
| Moderate | 0.0289641 | 0.0325660 | 0.0281381 | 0.0618009 |
| High | 0.0120379 | 0.0328097 | 0.1314421 | 0.0702234 |
| Outside | 0.0668382 | 0.0712932 | 0.0700339 | 0.2723900 |
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Critical: Risk level to Risk level", cex.main = 1)
# png(file="figures/critical_risk_to_risk.png")
dev.off()
## null device
## 1
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_non_crit.csv") %>%
mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 16 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Non-Critical: Risk level to Risk level")
| avg_eal_tercile_old | Low | Moderate | High | Outside |
|---|---|---|---|---|
| Low | 0.0162345 | 0.0333048 | 0.0170285 | 0.0541986 |
| Moderate | 0.0313408 | 0.0367105 | 0.0338271 | 0.0609682 |
| High | 0.0183239 | 0.0417877 | 0.1578947 | 0.0771818 |
| Outside | 0.0580431 | 0.0617622 | 0.0689497 | 0.2324440 |
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Non-Critical: Risk level to Risk level", cex.main = 1)
# png(file="figures/non_critical_risk_to_risk.png")
dev.off()
## null device
## 1
This chunk produces the chord diagrams for standalone and chains
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_chain.csv") %>%
mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 16 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Chain: Risk level to Risk level")
| avg_eal_tercile_old | Low | Moderate | High | Outside |
|---|---|---|---|---|
| Low | 0.0239119 | 0.0384862 | 0.0292970 | 0.0452547 |
| Moderate | 0.0373499 | 0.0460452 | 0.0461934 | 0.0478237 |
| High | 0.0328541 | 0.0570130 | 0.1558223 | 0.0509856 |
| Outside | 0.0636826 | 0.0644731 | 0.0606689 | 0.2001383 |
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Chain: Risk level to Risk level", cex.main = 1)
# png(file="figures/chain_risk_to_risk.png")
dev.off()
## null device
## 1
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_standalone.csv") %>%
mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 16 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Standalone: Risk level to Risk level")
| avg_eal_tercile_old | Low | Moderate | High | Outside |
|---|---|---|---|---|
| Low | 0.0145856 | 0.0311126 | 0.0121021 | 0.0602444 |
| Moderate | 0.0284123 | 0.0318321 | 0.0272199 | 0.0641963 |
| High | 0.0108505 | 0.0322164 | 0.1390559 | 0.0773431 |
| Outside | 0.0633192 | 0.0681581 | 0.0713906 | 0.2679610 |
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Standalone: Risk level to Risk level", cex.main = 1)
# png(file="figures/standalone_risk_to_risk.png")
dev.off()
## null device
## 1
This chunk produces across population categories. Outside the top100 is excluded.
#within same population category
data <- moves %>%
filter(pop_tercile_num_old==pop_tercile_num_new) %>%
group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Same pop category: Risk level to Risk level")
| avg_eal_tercile_old | High | Low | Moderate | Outside |
|---|---|---|---|---|
| High | 0.4685364 | 0.0425758 | 0.1014529 | NA |
| Low | 0.0436896 | 0.0349314 | 0.0807472 | 0.0006075 |
| Moderate | 0.0831266 | 0.0680909 | 0.0714322 | 0.0016706 |
| Outside | NA | 0.0004556 | 0.0026831 | NA |
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Same pop category: Risk level to Risk level", cex.main = 1)
# png(file="figures/samepop_risk_to_risk.png")
#lower pop to higher pop
data <- moves %>%
filter(!is.na(pop_tercile_old),pop_tercile_num_old>pop_tercile_num_new) %>%
group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Towards higher pop: Risk level to Risk level")
| avg_eal_tercile_old | High | Low | Moderate | Outside |
|---|---|---|---|---|
| High | 0.3667207 | 0.0683614 | 0.1417865 | 0.0037472 |
| Low | 0.0196476 | 0.0562082 | 0.0668422 | 0.0085072 |
| Moderate | 0.0282560 | 0.0941868 | 0.1438120 | 0.0019242 |
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Towards higher pop: Risk level to Risk level", cex.main = 1)
# png(file="figures/low_to_high_risk_to_risk.png")
#higher pop to lower pop
data <- moves %>%
filter(!is.na(pop_tercile_old),pop_tercile_num_old<pop_tercile_num_new) %>%
group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Toward lower pop: Risk level to Risk level")
| avg_eal_tercile_old | High | Low | Moderate |
|---|---|---|---|
| High | 0.3675349 | 0.0159453 | 0.0325839 |
| Low | 0.0676439 | 0.0499158 | 0.1152818 |
| Moderate | 0.1219174 | 0.0842825 | 0.1305338 |
| Outside | 0.0036645 | 0.0092107 | 0.0014856 |
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Toward lower pop: Risk level to Risk level", cex.main = 1)
# png(file="figures/high_to_low_risk_to_risk.png")
This chunk produces across population categories. Inclusive of top100.
#within same population category
data <- moves %>%
filter(pop_tercile_num_old==pop_tercile_num_new|avg_eal_tercile_old=="Outside"|avg_eal_tercile_new=="Outside") %>%
group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Same pop category: Risk level to Risk level")
| avg_eal_tercile_old | High | Low | Moderate | Outside |
|---|---|---|---|---|
| High | 0.0937832 | 0.0085221 | 0.0203070 | 0.0899833 |
| Low | 0.0087450 | 0.0069919 | 0.0161625 | 0.0712266 |
| Moderate | 0.0166388 | 0.0136292 | 0.0142980 | 0.0758170 |
| Outside | 0.0858489 | 0.0781679 | 0.0833055 | 0.3165729 |
table <- moves %>%
filter(avg_eal_tercile_old=="Outside") %>%
filter(avg_eal_tercile_new=="Outside") %>%
left_join(nri_cbsa_all,by=c("CBSAFP_new"="CBSAFP")) %>%
group_by(avg_eal_tercile_all) %>%
summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
kable(caption="Same pop category:Risk of Destinations outside top100")
table
| avg_eal_tercile_all | row_count |
|---|---|
| High | 13625 |
| Low | 8181 |
| Moderate | 9435 |
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Same pop category: Risk level to Risk level", cex.main = 1)
# png(file="figures/samepop_risk_to_risk.png")
#lower pop to higher pop
data <- moves %>%
filter(!is.na(pop_tercile_old),pop_tercile_num_old>pop_tercile_num_new) %>%
group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Towards higher pop: Risk level to Risk level")
| avg_eal_tercile_old | High | Low | Moderate | Outside |
|---|---|---|---|---|
| High | 0.3667207 | 0.0683614 | 0.1417865 | 0.0037472 |
| Low | 0.0196476 | 0.0562082 | 0.0668422 | 0.0085072 |
| Moderate | 0.0282560 | 0.0941868 | 0.1438120 | 0.0019242 |
table <- moves %>%
filter(!is.na(pop_tercile_old),pop_tercile_num_old>pop_tercile_num_new) %>%
filter(avg_eal_tercile_new=="Outside") %>%
left_join(nri_cbsa_all,by=c("CBSAFP_new"="CBSAFP")) %>%
group_by(avg_eal_tercile_all) %>%
summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
kable(caption="Towards higher pop:Risk of Destinations outside top100")
table
| avg_eal_tercile_all | row_count |
|---|---|
| Moderate | 140 |
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Towards higher pop: Risk level to Risk level", cex.main = 1)
# png(file="figures/low_to_high_risk_to_risk.png")
#higher pop to lower pop
data <- moves %>%
filter(!is.na(pop_tercile_old),pop_tercile_num_old<pop_tercile_num_new) %>%
group_by(avg_eal_tercile_old,avg_eal_tercile_new) %>%
summarise(row_count=sum(row_count))
## `summarise()` has grouped output by 'avg_eal_tercile_old'. You can override
## using the `.groups` argument.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Toward lower pop: Risk level to Risk level")
| avg_eal_tercile_old | High | Low | Moderate |
|---|---|---|---|
| High | 0.3675349 | 0.0159453 | 0.0325839 |
| Low | 0.0676439 | 0.0499158 | 0.1152818 |
| Moderate | 0.1219174 | 0.0842825 | 0.1305338 |
| Outside | 0.0036645 | 0.0092107 | 0.0014856 |
table <- moves %>%
filter(!is.na(pop_tercile_old),pop_tercile_num_old<pop_tercile_num_new) %>%
filter(avg_eal_tercile_new=="Outside") %>%
left_join(nri_cbsa_all,by=c("CBSAFP_new"="CBSAFP")) %>%
group_by(avg_eal_tercile_all) %>%
summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
kable(caption="Towards lower pop:Risk of Destinations outside top100")
table
| avg_eal_tercile_all | row_count |
|---|
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Toward lower pop: Risk level to Risk level", cex.main = 1)
# png(file="figures/high_to_low_risk_to_risk.png")
This chunk produces for same population moves, crit/non, chain/non-chain
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_crit_same_pop.csv") %>%
mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Critical Same Population Moves: Risk level to Risk level")
| avg_eal_tercile_old | High | Low | Moderate |
|---|---|---|---|
| High | 0.4626455 | 0.0411587 | 0.0918508 |
| Low | 0.0439087 | 0.0399670 | 0.0880924 |
| Moderate | 0.0735173 | 0.0747090 | 0.0841507 |
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Critical Same Population Moves: Risk level to Risk level", cex.main = 1)
# png(file="figures/chain_same_pop_risk_to_risk.png")
dev.off()
## null device
## 1
#noncrit
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_non_crit_same_pop.csv") %>%
mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Non-Critical Same Population Moves: Risk level to Risk level")
| avg_eal_tercile_old | High | Low | Moderate |
|---|---|---|---|
| High | 0.4721569 | 0.0495238 | 0.1001681 |
| Low | 0.0476190 | 0.0302521 | 0.0770868 |
| Moderate | 0.0834734 | 0.0676751 | 0.0720448 |
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Non-Critical Same Population Moves: Risk level to Risk level", cex.main = 1)
# png(file="figures/chain_same_pop_risk_to_risk.png")
dev.off()
## null device
## 1
#chain
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_chain_same_pop.csv") %>%
mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Chain Same Population Moves: Risk level to Risk level")
| avg_eal_tercile_old | High | Low | Moderate |
|---|---|---|---|
| High | 0.3718961 | 0.0741097 | 0.1376323 |
| Low | 0.0666025 | 0.0381136 | 0.0694899 |
| Moderate | 0.1054860 | 0.0625602 | 0.0741097 |
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Chain Same Population Moves: Risk level to Risk level", cex.main = 1)
# png(file="figures/chain_same_pop_risk_to_risk.png")
dev.off()
## null device
## 1
#standalone
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/moves_standalone_same_pop.csv") %>%
mutate(avg_eal_tercile_old=ifelse(is.na(avg_eal_tercile_old),"Outside",avg_eal_tercile_old)) %>%
mutate(avg_eal_tercile_new=ifelse(is.na(avg_eal_tercile_new),"Outside",avg_eal_tercile_new))
## Rows: 9 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): avg_eal_tercile_old, avg_eal_tercile_new
## dbl (1): row_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data %>%
transmute(avg_eal_tercile_old,avg_eal_tercile_new,pct=row_count/sum(data$row_count)) %>%
pivot_wider(names_from = avg_eal_tercile_new,values_from = pct) %>%
kable(caption = "Standalone Same Population Moves: Risk level to Risk level")
| avg_eal_tercile_old | High | Low | Moderate |
|---|---|---|---|
| High | 0.5006490 | 0.0345652 | 0.0806749 |
| Low | 0.0381174 | 0.0347018 | 0.0879842 |
| Moderate | 0.0682424 | 0.0747319 | 0.0803334 |
chordDiagram(
x=data,
directional = 1,
direction.type = c("diffHeight", "arrows"),
link.arr.type = "big.arrow",
grid.col = c(High="red",Moderate="yellow", Low="green",Outside="gray"))
title("Standalone Same Population Moves: Risk level to Risk level", cex.main = 1)
# png(file="figures/chain_same_pop_risk_to_risk.png")
dev.off()
## null device
## 1
This chunk produces bar graphs for all 100 CBSAs and corresponding entries/exits and births/deaths for all years of data
library(ggpattern)
## Warning: package 'ggpattern' was built under R version 4.4.1
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv")
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(data)
## archive_version_year CBSAFP entry exit
## Min. :2013 Min. :10420 Min. : 1.00 Min. : 1.00
## 1st Qu.:2015 1st Qu.:19348 1st Qu.: 26.00 1st Qu.: 25.00
## Median :2018 Median :32060 Median : 45.00 Median : 44.00
## Mean :2018 Mean :30209 Mean : 67.08 Mean : 67.08
## 3rd Qu.:2020 3rd Qu.:40080 3rd Qu.: 84.00 3rd Qu.: 79.00
## Max. :2022 Max. :49340 Max. :561.00 Max. :670.00
## NA's :100
## births deaths NAME avg_eal
## Min. : 1328 Min. : 1128 Length:1000 Min. :38.21
## 1st Qu.: 3617 1st Qu.: 3416 Class :character 1st Qu.:65.60
## Median : 6166 Median : 5294 Mode :character Median :75.28
## Mean : 12411 Mean : 10836 Mean :77.28
## 3rd Qu.: 13648 3rd Qu.: 11576 3rd Qu.:94.11
## Max. :171670 Max. :139494 Max. :99.91
## NA's :100
## tercile avg_eal_tercile
## Min. :1.00 Length:1000
## 1st Qu.:1.00 Class :character
## Median :2.00 Mode :character
## Mean :1.99
## 3rd Qu.:3.00
## Max. :3.00
##
data <- data %>% gather(type,row_count, c(entry,exit,births,deaths)) %>%
group_by(NAME,avg_eal_tercile,tercile,type) %>%
summarise(row_count=sum(row_count,na.rm = TRUE)) %>%
mutate(label= paste0(type,"-",avg_eal_tercile))
## `summarise()` has grouped output by 'NAME', 'avg_eal_tercile', 'tercile'. You
## can override using the `.groups` argument.
color_ramp <- c("births-Low" = "#00b100", "births-Moderate" = "#007e00", "births-High" = "#004b00",
"deaths-Low" = "#c600c6", "deaths-Moderate" = "#9f009f", "deaths-High" = "#780078")
data %>%
filter(type %in% c("births", "deaths")) %>%
group_by(NAME) %>%
mutate(sort = sum(row_count,na.rm = T)) %>%
ungroup() %>%
ggplot(aes(x = fct_reorder(NAME, sort), y = row_count, fill = label)) +
geom_bar(position = "dodge", stat = "identity")+
coord_flip()+
theme_minimal()+
scale_fill_manual(values=color_ramp)+
ggtitle("Births and Deaths - 2013-2022")+
theme(legend.position = "bottom")
# ggsave("figures/bar_chart_births_deaths.png")
color_ramp <- c("entry-Low" = "#00b100", "entry-Moderate" = "#007e00", "entry-High" = "#004b00",
"exit-Low" = "#c600c6", "exit-Moderate" = "#9f009f", "exit-High" = "#780078")
data %>% filter(type %in% c("entry","exit")) %>%
group_by(NAME) %>%
mutate(sort = sum(row_count,na.rm = T)) %>%
ungroup() %>%
ggplot(aes(fill = label, y = row_count, x = forcats::fct_reorder(NAME, sort))) +
geom_bar(position = "dodge", stat = "identity")+
coord_flip()+
theme_minimal()+
scale_fill_manual(values=color_ramp)+
ggtitle("Entries and Exits - 2013-2022")+
theme(legend.position = "bottom")
# ggsave("figures/bar_chart_entries_exits.png")
library(ggrepel)
# Load and preprocess data (using your original code)
data <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/cbsa_births_deaths_exits_entrys.csv") %>%
gather(type, row_count, c(entry, exit, births, deaths)) %>%
group_by(NAME, avg_eal_tercile, tercile, type) %>%
summarise(row_count = sum(row_count, na.rm = TRUE)) %>%
mutate(label = paste0(type, "-", avg_eal_tercile))
## Rows: 1000 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): NAME, avg_eal_tercile
## dbl (8): archive_version_year, CBSAFP, entry, exit, births, deaths, avg_eal,...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
## `summarise()` has grouped output by 'NAME', 'avg_eal_tercile', 'tercile'. You can override using the `.groups` argument.
# Pivot to get births/deaths terciles per CBSA
plot_data <- data %>%
filter(type %in% c("births", "deaths")) %>%
select(NAME, type, tercile) %>%
pivot_wider(names_from = type, values_from = tercile) %>%
mutate(
# Convert terciles to numeric positions (Moderate = 0)
x = case_when(
births == 1 ~ -1,
births == 2 ~ 0,
births == 3 ~ 1
),
y = case_when(
deaths == 1 ~ -1,
deaths == 2 ~ 0,
deaths == 3 ~ 1
)
)
## Adding missing grouping variables: `avg_eal_tercile`
# Plot
ggplot(plot_data, aes(x = x, y = y)) +
# Reference lines (center at Moderate)
geom_vline(xintercept = 0, linetype = "dashed", color = "gray70") +
geom_hline(yintercept = 0, linetype = "dashed", color = "gray70") +
# Points (smaller size = 2)
geom_point(aes(color = avg_eal_tercile), size = 2, alpha = 0.8) +
# Non-overlapping labels
geom_text_repel(
aes(label = NAME),
size = 2.5,
max.overlaps = 20,
segment.size = 0.2
) +
# Axis labels (discrete scale)
scale_x_continuous(
breaks = c(-1, 0, 1),
labels = c("Low Births", "Moderate Births", "High Births")
) +
scale_y_continuous(
breaks = c(-1, 0, 1),
labels = c("Low Deaths", "Moderate Deaths", "High Deaths")
) +
# Color scheme (EAL terciles)
scale_color_manual(
values = c("Low" = "#00b100", "Moderate" = "#007e00", "High" = "#004b00")
) +
labs(
title = "Births vs. Deaths by CBSA (Tercile Comparison)",
x = "",
y = "",
color = "EAL Tercile"
) +
theme_minimal() +
theme(
legend.position = "bottom",
panel.grid.major = element_blank() # Remove gridlines for clarity
)
## Warning: ggrepel: 100 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
expansion_size <- read_csv("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_rawdata.csv") %>%
mutate(growth_count=employee_size_location-lag_employees,
growth_pct=growth_count/lag_employees)
## Rows: 100600 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): company, avg_eal_tercile, avg_eal_tercile_parent, parent_region
## dbl (13): archive_version_year, parent_number, fips_code, primary_naics_code...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
summary(expansion_size$growth_count)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 1.00 4.00 23.62 11.00 64975.00
expansion_size %>%
filter(growth_count <= quantile(growth_count, 0.95)) %>%
ggplot()+geom_histogram(aes(growth_count))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(expansion_size$growth_pct)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.001 0.333 0.667 3.626 2.000 10999.000
expansion_size %>%
filter(growth_pct <= quantile(growth_pct, 0.95)) %>%
ggplot()+geom_histogram(aes(growth_pct))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
process_data <- function(file_path, caption, include_yearly_column = FALSE) {
table <- read_csv(file_path) %>%
mutate(
avg_eal_tercile_parent = ifelse(is.na(avg_eal_tercile_parent), "Outside100", avg_eal_tercile_parent),
Total = Low + Moderate + High,
Low_pct = round(Low / Total, 2),
Moderate_pct = round(Moderate / Total, 2),
High_pct = round(High / Total, 2),
)
if (include_yearly_column) {
table <- table %>%
rename(year=archive_version_year) %>%
select(year,avg_eal_tercile_parent, Low, Low_pct, Moderate, Moderate_pct, High, High_pct, Total)
} else {
table <- table %>%
select(avg_eal_tercile_parent, Low, Low_pct, Moderate, Moderate_pct, High, High_pct, Total)
}
kable(table, caption = caption)
}
# All expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_all.csv", "ALL Expansions, rows (hq), columns (branch)")
## Rows: 4 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (4): Low, Moderate, High, <NA>
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
| avg_eal_tercile_parent | Low | Low_pct | Moderate | Moderate_pct | High | High_pct | Total |
|---|---|---|---|---|---|---|---|
| High | 3343 | 0.22 | 4861 | 0.32 | 6970 | 0.46 | 15174 |
| Low | 5331 | 0.40 | 4463 | 0.34 | 3412 | 0.26 | 13206 |
| Moderate | 2857 | 0.24 | 5810 | 0.50 | 3049 | 0.26 | 11716 |
| Outside100 | 4421 | 0.31 | 5441 | 0.38 | 4497 | 0.31 | 14359 |
# Yearly expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_yearly.csv", "A<< Expansions yearly, rows (hq), columns (branch)", TRUE)
## Rows: 40 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (5): archive_version_year, Low, Moderate, High, <NA>
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
| year | avg_eal_tercile_parent | Low | Low_pct | Moderate | Moderate_pct | High | High_pct | Total |
|---|---|---|---|---|---|---|---|---|
| 2013 | High | 678 | 0.22 | 1057 | 0.34 | 1356 | 0.44 | 3091 |
| 2013 | Low | 673 | 0.36 | 630 | 0.34 | 558 | 0.30 | 1861 |
| 2013 | Moderate | 570 | 0.26 | 986 | 0.45 | 657 | 0.30 | 2213 |
| 2013 | Outside100 | 952 | 0.31 | 1114 | 0.36 | 1028 | 0.33 | 3094 |
| 2014 | High | 202 | 0.20 | 314 | 0.31 | 498 | 0.49 | 1014 |
| 2014 | Low | 237 | 0.43 | 177 | 0.32 | 138 | 0.25 | 552 |
| 2014 | Moderate | 205 | 0.23 | 453 | 0.51 | 237 | 0.26 | 895 |
| 2014 | Outside100 | 272 | 0.31 | 325 | 0.37 | 273 | 0.31 | 870 |
| 2015 | High | 201 | 0.25 | 244 | 0.31 | 351 | 0.44 | 796 |
| 2015 | Low | 355 | 0.41 | 297 | 0.34 | 214 | 0.25 | 866 |
| 2015 | Moderate | 173 | 0.23 | 398 | 0.54 | 169 | 0.23 | 740 |
| 2015 | Outside100 | 212 | 0.28 | 301 | 0.40 | 245 | 0.32 | 758 |
| 2016 | High | 1014 | 0.21 | 1502 | 0.31 | 2282 | 0.48 | 4798 |
| 2016 | Low | 1289 | 0.43 | 929 | 0.31 | 775 | 0.26 | 2993 |
| 2016 | Moderate | 849 | 0.24 | 1763 | 0.50 | 929 | 0.26 | 3541 |
| 2016 | Outside100 | 1221 | 0.29 | 1622 | 0.38 | 1408 | 0.33 | 4251 |
| 2017 | High | 230 | 0.23 | 321 | 0.32 | 449 | 0.45 | 1000 |
| 2017 | Low | 313 | 0.46 | 217 | 0.32 | 154 | 0.23 | 684 |
| 2017 | Moderate | 172 | 0.23 | 390 | 0.53 | 176 | 0.24 | 738 |
| 2017 | Outside100 | 309 | 0.32 | 359 | 0.38 | 288 | 0.30 | 956 |
| 2018 | High | 163 | 0.21 | 249 | 0.32 | 366 | 0.47 | 778 |
| 2018 | Low | 250 | 0.44 | 183 | 0.32 | 135 | 0.24 | 568 |
| 2018 | Moderate | 180 | 0.25 | 362 | 0.50 | 180 | 0.25 | 722 |
| 2018 | Outside100 | 408 | 0.33 | 490 | 0.40 | 341 | 0.28 | 1239 |
| 2019 | High | 119 | 0.25 | 148 | 0.31 | 210 | 0.44 | 477 |
| 2019 | Low | 131 | 0.43 | 101 | 0.33 | 71 | 0.23 | 303 |
| 2019 | Moderate | 97 | 0.27 | 172 | 0.47 | 96 | 0.26 | 365 |
| 2019 | Outside100 | 124 | 0.32 | 156 | 0.40 | 112 | 0.29 | 392 |
| 2020 | High | 107 | 0.21 | 173 | 0.33 | 238 | 0.46 | 518 |
| 2020 | Low | 155 | 0.40 | 126 | 0.33 | 104 | 0.27 | 385 |
| 2020 | Moderate | 79 | 0.25 | 158 | 0.50 | 80 | 0.25 | 317 |
| 2020 | Outside100 | 120 | 0.32 | 153 | 0.40 | 106 | 0.28 | 379 |
| 2021 | High | 121 | 0.25 | 152 | 0.31 | 217 | 0.44 | 490 |
| 2021 | Low | 210 | 0.40 | 149 | 0.28 | 170 | 0.32 | 529 |
| 2021 | Moderate | 83 | 0.28 | 158 | 0.54 | 54 | 0.18 | 295 |
| 2021 | Outside100 | 114 | 0.28 | 148 | 0.36 | 145 | 0.36 | 407 |
| 2022 | High | 508 | 0.23 | 701 | 0.32 | 1003 | 0.45 | 2212 |
| 2022 | Low | 1718 | 0.38 | 1654 | 0.37 | 1093 | 0.24 | 4465 |
| 2022 | Moderate | 449 | 0.24 | 970 | 0.51 | 471 | 0.25 | 1890 |
| 2022 | Outside100 | 689 | 0.34 | 773 | 0.38 | 551 | 0.27 | 2013 |
# All expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_all_10.csv", "Expansions, rows (hq), columns (branch)")
## Rows: 4 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (4): Low, Moderate, High, <NA>
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
| avg_eal_tercile_parent | Low | Low_pct | Moderate | Moderate_pct | High | High_pct | Total |
|---|---|---|---|---|---|---|---|
| High | 979 | 0.21 | 1424 | 0.31 | 2227 | 0.48 | 4630 |
| Low | 1383 | 0.47 | 868 | 0.29 | 713 | 0.24 | 2964 |
| Moderate | 743 | 0.23 | 1718 | 0.53 | 783 | 0.24 | 3244 |
| Outside100 | 1252 | 0.31 | 1530 | 0.38 | 1253 | 0.31 | 4035 |
# Yearly expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_yearly_10.csv", "Expansions yearly, rows (hq), columns (branch)", TRUE)
## Rows: 40 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (5): archive_version_year, Low, Moderate, High, <NA>
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
| year | avg_eal_tercile_parent | Low | Low_pct | Moderate | Moderate_pct | High | High_pct | Total |
|---|---|---|---|---|---|---|---|---|
| 2013 | High | 225 | 0.22 | 346 | 0.33 | 466 | 0.45 | 1037 |
| 2013 | Low | 211 | 0.39 | 159 | 0.29 | 169 | 0.31 | 539 |
| 2013 | Moderate | 150 | 0.24 | 287 | 0.46 | 189 | 0.30 | 626 |
| 2013 | Outside100 | 323 | 0.30 | 374 | 0.35 | 384 | 0.36 | 1081 |
| 2014 | High | 49 | 0.19 | 77 | 0.30 | 134 | 0.52 | 260 |
| 2014 | Low | 71 | 0.44 | 55 | 0.34 | 37 | 0.23 | 163 |
| 2014 | Moderate | 41 | 0.19 | 128 | 0.60 | 43 | 0.20 | 212 |
| 2014 | Outside100 | 73 | 0.28 | 93 | 0.36 | 92 | 0.36 | 258 |
| 2015 | High | 47 | 0.20 | 68 | 0.30 | 115 | 0.50 | 230 |
| 2015 | Low | 70 | 0.42 | 50 | 0.30 | 48 | 0.29 | 168 |
| 2015 | Moderate | 49 | 0.25 | 96 | 0.49 | 51 | 0.26 | 196 |
| 2015 | Outside100 | 58 | 0.26 | 88 | 0.39 | 77 | 0.35 | 223 |
| 2016 | High | 269 | 0.21 | 401 | 0.31 | 612 | 0.48 | 1282 |
| 2016 | Low | 303 | 0.47 | 190 | 0.30 | 151 | 0.23 | 644 |
| 2016 | Moderate | 214 | 0.24 | 461 | 0.52 | 219 | 0.24 | 894 |
| 2016 | Outside100 | 255 | 0.30 | 334 | 0.40 | 251 | 0.30 | 840 |
| 2017 | High | 84 | 0.21 | 130 | 0.32 | 191 | 0.47 | 405 |
| 2017 | Low | 119 | 0.49 | 72 | 0.30 | 53 | 0.22 | 244 |
| 2017 | Moderate | 47 | 0.22 | 136 | 0.63 | 32 | 0.15 | 215 |
| 2017 | Outside100 | 129 | 0.36 | 128 | 0.36 | 103 | 0.29 | 360 |
| 2018 | High | 60 | 0.19 | 93 | 0.30 | 161 | 0.51 | 314 |
| 2018 | Low | 104 | 0.50 | 60 | 0.29 | 45 | 0.22 | 209 |
| 2018 | Moderate | 56 | 0.21 | 140 | 0.53 | 67 | 0.25 | 263 |
| 2018 | Outside100 | 110 | 0.34 | 133 | 0.41 | 82 | 0.25 | 325 |
| 2019 | High | 34 | 0.24 | 39 | 0.27 | 69 | 0.49 | 142 |
| 2019 | Low | 59 | 0.46 | 41 | 0.32 | 28 | 0.22 | 128 |
| 2019 | Moderate | 30 | 0.27 | 56 | 0.51 | 24 | 0.22 | 110 |
| 2019 | Outside100 | 49 | 0.30 | 70 | 0.43 | 45 | 0.27 | 164 |
| 2020 | High | 24 | 0.16 | 40 | 0.27 | 83 | 0.56 | 147 |
| 2020 | Low | 49 | 0.49 | 35 | 0.35 | 16 | 0.16 | 100 |
| 2020 | Moderate | 21 | 0.23 | 55 | 0.60 | 16 | 0.17 | 92 |
| 2020 | Outside100 | 36 | 0.29 | 60 | 0.48 | 28 | 0.23 | 124 |
| 2021 | High | 43 | 0.30 | 33 | 0.23 | 65 | 0.46 | 141 |
| 2021 | Low | 45 | 0.46 | 26 | 0.27 | 27 | 0.28 | 98 |
| 2021 | Moderate | 23 | 0.25 | 56 | 0.60 | 14 | 0.15 | 93 |
| 2021 | Outside100 | 32 | 0.29 | 36 | 0.33 | 41 | 0.38 | 109 |
| 2022 | High | 144 | 0.21 | 197 | 0.29 | 331 | 0.49 | 672 |
| 2022 | Low | 352 | 0.52 | 180 | 0.27 | 139 | 0.21 | 671 |
| 2022 | Moderate | 112 | 0.21 | 303 | 0.56 | 128 | 0.24 | 543 |
| 2022 | Outside100 | 187 | 0.34 | 214 | 0.39 | 150 | 0.27 | 551 |
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_all_20.csv", "Expansions, rows (hq), columns (branch)")
## Rows: 4 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (4): Low, Moderate, High, <NA>
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
| avg_eal_tercile_parent | Low | Low_pct | Moderate | Moderate_pct | High | High_pct | Total |
|---|---|---|---|---|---|---|---|
| High | 529 | 0.20 | 819 | 0.31 | 1337 | 0.50 | 2685 |
| Low | 886 | 0.49 | 522 | 0.29 | 409 | 0.23 | 1817 |
| Moderate | 474 | 0.23 | 1127 | 0.54 | 480 | 0.23 | 2081 |
| Outside100 | 709 | 0.29 | 1006 | 0.41 | 723 | 0.30 | 2438 |
# Yearly expansions
process_data("/Users/mac/Library/CloudStorage/OneDrive-HarvardUniversity/DataAxle Data/matrices_old/matrix_expansion_yearly_20.csv", "Expansions yearly, rows (hq), columns (branch)", TRUE)
## Rows: 40 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): avg_eal_tercile_parent
## dbl (5): archive_version_year, Low, Moderate, High, <NA>
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
| year | avg_eal_tercile_parent | Low | Low_pct | Moderate | Moderate_pct | High | High_pct | Total |
|---|---|---|---|---|---|---|---|---|
| 2013 | High | 121 | 0.20 | 200 | 0.33 | 281 | 0.47 | 602 |
| 2013 | Low | 115 | 0.41 | 78 | 0.28 | 90 | 0.32 | 283 |
| 2013 | Moderate | 74 | 0.22 | 159 | 0.48 | 99 | 0.30 | 332 |
| 2013 | Outside100 | 152 | 0.26 | 242 | 0.41 | 194 | 0.33 | 588 |
| 2014 | High | 25 | 0.15 | 50 | 0.30 | 92 | 0.55 | 167 |
| 2014 | Low | 53 | 0.49 | 37 | 0.34 | 19 | 0.17 | 109 |
| 2014 | Moderate | 25 | 0.18 | 87 | 0.63 | 26 | 0.19 | 138 |
| 2014 | Outside100 | 51 | 0.28 | 74 | 0.41 | 57 | 0.31 | 182 |
| 2015 | High | 20 | 0.14 | 44 | 0.31 | 76 | 0.54 | 140 |
| 2015 | Low | 47 | 0.44 | 36 | 0.34 | 24 | 0.22 | 107 |
| 2015 | Moderate | 26 | 0.22 | 60 | 0.51 | 32 | 0.27 | 118 |
| 2015 | Outside100 | 33 | 0.23 | 62 | 0.44 | 47 | 0.33 | 142 |
| 2016 | High | 140 | 0.20 | 220 | 0.32 | 327 | 0.48 | 687 |
| 2016 | Low | 185 | 0.45 | 122 | 0.29 | 108 | 0.26 | 415 |
| 2016 | Moderate | 160 | 0.27 | 302 | 0.51 | 130 | 0.22 | 592 |
| 2016 | Outside100 | 148 | 0.27 | 231 | 0.43 | 163 | 0.30 | 542 |
| 2017 | High | 53 | 0.20 | 83 | 0.32 | 127 | 0.48 | 263 |
| 2017 | Low | 86 | 0.52 | 51 | 0.31 | 27 | 0.16 | 164 |
| 2017 | Moderate | 28 | 0.20 | 93 | 0.65 | 22 | 0.15 | 143 |
| 2017 | Outside100 | 71 | 0.34 | 77 | 0.37 | 59 | 0.29 | 207 |
| 2018 | High | 42 | 0.20 | 60 | 0.29 | 104 | 0.50 | 206 |
| 2018 | Low | 82 | 0.49 | 51 | 0.31 | 33 | 0.20 | 166 |
| 2018 | Moderate | 42 | 0.21 | 106 | 0.54 | 49 | 0.25 | 197 |
| 2018 | Outside100 | 63 | 0.34 | 73 | 0.39 | 49 | 0.26 | 185 |
| 2019 | High | 23 | 0.27 | 24 | 0.28 | 39 | 0.45 | 86 |
| 2019 | Low | 45 | 0.49 | 29 | 0.32 | 18 | 0.20 | 92 |
| 2019 | Moderate | 19 | 0.28 | 34 | 0.50 | 15 | 0.22 | 68 |
| 2019 | Outside100 | 37 | 0.33 | 47 | 0.42 | 27 | 0.24 | 111 |
| 2020 | High | 14 | 0.15 | 20 | 0.22 | 58 | 0.63 | 92 |
| 2020 | Low | 31 | 0.49 | 25 | 0.40 | 7 | 0.11 | 63 |
| 2020 | Moderate | 13 | 0.23 | 32 | 0.56 | 12 | 0.21 | 57 |
| 2020 | Outside100 | 25 | 0.31 | 37 | 0.46 | 19 | 0.23 | 81 |
| 2021 | High | 17 | 0.24 | 14 | 0.20 | 40 | 0.56 | 71 |
| 2021 | Low | 26 | 0.57 | 7 | 0.15 | 13 | 0.28 | 46 |
| 2021 | Moderate | 17 | 0.27 | 36 | 0.56 | 11 | 0.17 | 64 |
| 2021 | Outside100 | 18 | 0.33 | 17 | 0.31 | 20 | 0.36 | 55 |
| 2022 | High | 74 | 0.20 | 104 | 0.28 | 193 | 0.52 | 371 |
| 2022 | Low | 216 | 0.58 | 86 | 0.23 | 70 | 0.19 | 372 |
| 2022 | Moderate | 70 | 0.19 | 218 | 0.59 | 84 | 0.23 | 372 |
| 2022 | Outside100 | 111 | 0.32 | 146 | 0.42 | 88 | 0.26 | 345 |