Use Crunchbase categories to calculate the startup complexity index

# Location quotient (LQ) of every technology category within every CBSA:
#   lq = (category's share of the CBSA's rows) / (category's share of all rows)
cb_tech_cbsa <- cb_all_cbsa %>%
  # split the comma-separated Categories field into one row per category
  unnest_tokens(tech_name, Categories, token = "regex", pattern = ",") %>%
  mutate(tech_name = trimws(tech_name)) %>%
  # right join: every row of cat_token is kept, whether or not it matched
  right_join(cat_token, by = "tech_name") %>%
  # NOTE(review): is_broad presumably comes from cat_token -- confirm
  filter(!is_broad) %>%
  # filter(!rm) %>%

  # number of rows per msa-tech pair, then the grand total over all pairs
  group_by(cbsa_code, cbsa_name, tech_name) %>%
  count() %>%
  ungroup() %>%
  mutate(us_total = sum(n)) %>%

  # share of each category in the overall total
  group_by(tech_name) %>%
  mutate(
    tech_us_total = sum(n),
    tech_us_share = tech_us_total / us_total
  ) %>%

  # share of each category within its CBSA, and the resulting LQ
  group_by(cbsa_code, cbsa_name) %>%
  mutate(
    tech_msa_share = n / sum(n),
    lq = tech_msa_share / tech_us_share
  ) %>%
  arrange(desc(lq)) %>%
  ungroup()

skimr::skim(cb_tech_cbsa)
## Skim summary statistics
##  n obs: 5880 
##  n variables: 9 
## 
## -- Variable type:character -----------------------------------------------------------------------------------------------
##   variable missing complete    n min max empty n_unique
##  cbsa_code     155     5725 5880   5   5     0      315
##  cbsa_name     155     5725 5880   8  46     0      315
##  tech_name       0     5880 5880   2  47     0      653
## 
## -- Variable type:integer -------------------------------------------------------------------------------------------------
##       variable missing complete    n     mean     sd    p0   p25   p50
##              n       0     5880 5880     2.51   5.9      1     1     1
##  tech_us_total       0     5880 5880    87.42 123.54     1    14    38
##       us_total       0     5880 5880 14740      0    14740 14740 14740
##    p75  p100     hist
##      2   166 <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##    105   546 <U+2587><U+2582><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##  14740 14740 <U+2581><U+2581><U+2581><U+2587><U+2581><U+2581><U+2581><U+2581>
## 
## -- Variable type:numeric -------------------------------------------------------------------------------------------------
##        variable missing complete    n    mean       sd      p0     p25
##              lq       0     5880 5880 38.76   231.81   0.11    1.32   
##  tech_msa_share       0     5880 5880  0.054    0.13   0.00033 0.0032 
##   tech_us_share       0     5880 5880  0.0059   0.0084 6.8e-05 0.00095
##     p50     p75     p100     hist
##  3.21   12.54   7370     <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##  0.01    0.038     1     <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##  0.0026  0.0071    0.037 <U+2587><U+2582><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>

Complexity

## Skim summary statistics
##  n obs: 4906 
##  n variables: 11 
## 
## -- Variable type:character -----------------------------------------------------------------------------------------------
##   variable missing complete    n min max empty n_unique
##  cbsa_code     139     4767 4906   5   5     0      315
##  cbsa_name     139     4767 4906   8  46     0      315
##  tech_name       0     4906 4906   2  47     0      653
## 
## -- Variable type:integer -------------------------------------------------------------------------------------------------
##       variable missing complete    n     mean     sd    p0   p25   p50
##            div       0     4906 4906    82.25  69.57     1    21    57
##              n       0     4906 4906     2.3    5.72     1     1     1
##  tech_us_total       0     4906 4906    80.45 118.52     1    12    32
##            ubi       0     4906 4906    19.02  17.41     1     7    14
##       us_total       0     4906 4906 14740      0    14740 14740 14740
##    p75  p100     hist
##    137   227 <U+2587><U+2585><U+2582><U+2582><U+2583><U+2582><U+2581><U+2583>
##      2   166 <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##     99   546 <U+2587><U+2582><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##     26    95 <U+2587><U+2586><U+2582><U+2581><U+2581><U+2581><U+2581><U+2581>
##  14740 14740 <U+2581><U+2581><U+2581><U+2587><U+2581><U+2581><U+2581><U+2581>
## 
## -- Variable type:numeric -------------------------------------------------------------------------------------------------
##        variable missing complete    n    mean      sd      p0     p25
##              lq       0     4906 4906 46.32   253.11  1       1.98   
##  tech_msa_share       0     4906 4906  0.063    0.14  0.00033 0.0045 
##   tech_us_share       0     4906 4906  0.0055   0.008 6.8e-05 0.00081
##     p50     p75     p100     hist
##  4.61   17.55   7370     <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##  0.014   0.05      1     <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##  0.0022  0.0067    0.037 <U+2587><U+2582><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
# Summary of ubi over the distinct (tech_name, ubi) combinations.
complexity_cbsa %>%
  distinct(tech_name, ubi) %>%
  skimr::skim()
## Skim summary statistics
##  n obs: 653 
##  n variables: 2 
## 
## -- Variable type:character -----------------------------------------------------------------------------------------------
##   variable missing complete   n min max empty n_unique
##  tech_name       0      653 653   2  47     0      653
## 
## -- Variable type:integer -------------------------------------------------------------------------------------------------
##  variable missing complete   n mean  sd p0 p25 p50 p75 p100     hist
##       ubi       0      653 653 7.51 9.3  1   1   4  10   95 <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
# Summary of div over the distinct (cbsa_code, div) combinations.
complexity_cbsa %>%
  distinct(cbsa_code, div) %>%
  skimr::skim()
## Skim summary statistics
##  n obs: 316 
##  n variables: 2 
## 
## -- Variable type:character -----------------------------------------------------------------------------------------------
##   variable missing complete   n min max empty n_unique
##  cbsa_code       1      315 316   5   5     0      315
## 
## -- Variable type:integer -------------------------------------------------------------------------------------------------
##  variable missing complete   n  mean    sd p0 p25 p50 p75 p100     hist
##       div       0      316 316 15.53 32.24  1   2   4  12  227 <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
## Skim summary statistics
##  n obs: 3271 
##  n variables: 11 
## 
## -- Variable type:character -----------------------------------------------------------------------------------------------
##   variable missing complete    n min max empty n_unique
##  cbsa_code       0     3271 3271   5   5     0       83
##  cbsa_name       0     3271 3271   8  46     0       83
##  tech_name       0     3271 3271   2  30     0      278
## 
## -- Variable type:integer -------------------------------------------------------------------------------------------------
##       variable missing complete    n     mean     sd    p0   p25   p50
##              n       0     3271 3271     2.87   6.93     1     1     1
##  tech_us_total       0     3271 3271    88.19 116.31     6    20    42
##       us_total       0     3271 3271 14740      0    14740 14740 14740
##    p75  p100     hist
##      2   166 <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##    104   546 <U+2587><U+2582><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##  14740 14740 <U+2581><U+2581><U+2581><U+2587><U+2581><U+2581><U+2581><U+2581>
## 
## -- Variable type:numeric -------------------------------------------------------------------------------------------------
##        variable missing complete    n   mean      sd       p0     p25
##             div       0     3271 3271 65.68   7.86   44.71    59.67  
##              lq       0     3271 3271  7.67  14.83    1        1.65  
##  tech_msa_share       0     3271 3271  0.022  0.024   0.00067  0.005 
##   tech_us_share       0     3271 3271  0.006  0.0079  0.00041  0.0014
##             ubi       0     3271 3271 65.68  16.95   32.75    53.73  
##      p50     p75    p100     hist
##  64.83   73.86    78.12  <U+2581><U+2581><U+2583><U+2586><U+2585><U+2585><U+2583><U+2587>
##   2.97    6.77   204.72  <U+2587><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##   0.013   0.032    0.17  <U+2587><U+2582><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##   0.0028  0.0071   0.037 <U+2587><U+2582><U+2581><U+2581><U+2581><U+2581><U+2581><U+2581>
##  65.4    78.44   113.29  <U+2583><U+2586><U+2587><U+2587><U+2586><U+2583><U+2582><U+2581>
# Interactive version of the plot returned by plot_mean_ubi().
ggplotly(plot_mean_ubi(complexity_cbsa_final))

# CBSA-level table: div is exposed as startup_complexity, rows are sorted
# from highest to lowest score, and duplicate rows are dropped.
cbsa_KCI <- cbsa_TCI_KCI %>%
  select(cbsa_code, cbsa_name, div) %>%
  rename(startup_complexity = div) %>%
  arrange(desc(startup_complexity)) %>%
  unique()

head(cbsa_KCI)
## # A tibble: 6 x 3
##   cbsa_code cbsa_name                                    startup_complexity
##   <chr>     <chr>                                                     <dbl>
## 1 31080     Los Angeles-Long Beach-Anaheim, CA                         78.1
## 2 35620     New York-Newark-Jersey City, NY-NJ-PA                      75.9
## 3 41940     San Jose-Sunnyvale-Santa Clara, CA                         75.7
## 4 16980     Chicago-Naperville-Elgin, IL-IN-WI                         75.2
## 5 41860     San Francisco-Oakland-Berkeley, CA                         74.8
## 6 47900     Washington-Arlington-Alexandria, DC-VA-MD-WV               74.1
# Last rows of cbsa_KCI; the table was sorted by descending
# startup_complexity when it was built, so these are the lowest scores.
tail(cbsa_KCI)
## # A tibble: 6 x 3
##   cbsa_code cbsa_name                                startup_complexity
##   <chr>     <chr>                                                 <dbl>
## 1 13820     Birmingham-Hoover, AL                                  50.4
## 2 16820     Charlottesville, VA                                    50.3
## 3 36420     Oklahoma City, OK                                      50.2
## 4 31140     Louisville/Jefferson County, KY-IN                     48.3
## 5 16860     Chattanooga, TN-GA                                     48.1
## 6 30780     Little Rock-North Little Rock-Conway, AR               44.7
library(corrplot)
## corrplot 0.84 loaded
# Restore the objects saved in the .rda file into the workspace.
# NOTE(review): cbsa_cor is presumably one of them -- confirm.
load("data/cbsa_cor.rda")

# Pairwise correlations between columns 3 to 10 of cbsa_cor; each pair uses
# only the rows where both columns are non-missing.
M <- cor(cbsa_cor[3:10], use = "pairwise.complete.obs")

# Upper-triangle colour map with the coefficients printed in black and the
# text labels rotated by 45 degrees.
corrplot(
  M,
  method = "color",
  type = "upper",
  addCoef.col = "black",
  tl.col = "black",
  tl.srt = 45
)