1. Importing Data

Download a copy of this project here!

library(tidyverse)
library(sf)
library(scales)

Let’s import our key geographic data.

# Load in social infrastructure points
sites <- read_sf("raw_data/mygoogle.kml") %>%
  select(-c(Name:icon)) %>% # get rid of fluff
  select(group, geometry) # we only need these two columns

# Load in county polygon
county <- read_sf("raw_data/county.kml") %>%
  select(-c(Name:icon)) # get rid of fluff

# Load in zipcode polygons
zip <- read_sf("raw_data/zipcodes.kml") %>%
  select(-c(Name:icon))# get rid of fluff

# Load in grid
grid <- read_sf("raw_data/grid_covariates_tracts.kml") %>%
  select(-c(Name:icon)) # get rid of fluff

Let’s import our Facebook data!

fb <- read_csv("raw_data/boston_fb_sci.csv")

2. Viewing the Data

As our next step, let’s try to view some of this data!

First, let’s check what are the contents of each dataset!

Each row here is a social infrastructure site!

sites %>% head(2)
## Simple feature collection with 2 features and 1 field
## Geometry type: POINT
## Dimension:     XYZ
## Bounding box:  xmin: -71.10646 ymin: 42.31905 xmax: -71.045 ymax: 42.34176
## z_range:       zmin: 0 zmax: 0
## Geodetic CRS:  WGS 84
## # A tibble: 2 × 2
##   group                            geometry
##   <chr>                         <POINT [°]>
## 1 Community Spaces   Z (-71.045 42.31905 0)
## 2 Community Spaces Z (-71.10646 42.34176 0)

Each row here is a county. (Just 1!)

county %>% head(2)
## Simple feature collection with 1 feature and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -71.19115 ymin: 42.22793 xmax: -70.9226 ymax: 42.45012
## Geodetic CRS:  WGS 84
## # A tibble: 1 × 3
##   geoid area_land                                                       geometry
##   <chr>     <dbl>                                             <MULTIPOLYGON [°]>
## 1 25025 150863059 (((-70.93091 42.3216, -70.93025 42.32229, -70.92597 42.32191,…

Each row here is a 1 km2 grid cell!

grid %>% head(2)
## Simple feature collection with 2 features and 21 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -71.15658 ymin: 42.23028 xmax: -71.12166 ymax: 42.2548
## Geodetic CRS:  WGS 84
## # A tibble: 2 × 22
##   cell_id neighborhood milestone       pop_density pop_women pop_white pop_black
##   <chr>   <chr>        <chr>                 <dbl>     <dbl>     <dbl>     <dbl>
## 1 Cell 1  Hyde Park    M4: Outer Bost…       2768.     0.551     0.411     0.466
## 2 Cell 10 Hyde Park    M4: Outer Bost…       3124.     0.531     0.401     0.410
## # … with 15 more variables: pop_natam <dbl>, pop_asian <dbl>,
## #   pop_pacific <dbl>, pop_hisplat <dbl>, pop_some_college <dbl>,
## #   employees_muni <dbl>, employees_state <dbl>, employees_fed <dbl>,
## #   median_income <dbl>, income_inequality <dbl>, pop_unemployed <dbl>,
## #   median_monthly_housing_cost <dbl>, pop_age_65_plus <dbl>,
## #   pop_density_int <dbl>, geometry <POLYGON [°]>

Each row here is a zipcode! The geoid column describes the zipcode.

zip %>% head(2)
## Simple feature collection with 2 features and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -71.0856 ymin: 42.34541 xmax: -71.05719 ymax: 42.36086
## Geodetic CRS:  WGS 84
## # A tibble: 2 × 3
##   geoid area_land                                                       geometry
##   <chr>     <dbl>                                             <MULTIPOLYGON [°]>
## 1 02108    353358 (((-71.07522 42.35551, -71.07317 42.35646, -71.07231 42.3588,…
## 2 02199    148225 (((-71.0856 42.34788, -71.08375 42.34838, -71.07951 42.34951,…

Each row in fb is a pair of zipcodes, where scaled_sci shows how many Facebook friends connect those two zipcodes. For privacy, it’s been scaled from 1 to 1 billion.

fb %>% head(2)
## # A tibble: 2 × 3
##   from  to    scaled_sci
##   <chr> <chr>      <dbl>
## 1 02021 02021   15961499
## 2 02021 02026    1591400

Let’s try to visualize this data, on a map. Everything except for fb can be readily visualized on a map, using the ggplot() and geom_sf() functions. We’ll also want to crop the map using coord_sf() for readability.

ggplot() +
  # Map my zipcode boundaries
  geom_sf(data = zip) +
  # Map my points, using a nice shade of blue, with 50% transparency (alpha)
  geom_sf(data = sites, color = "steelblue", alpha = 0.5) +
  # Crop the map
  coord_sf(xlim = c(-71.14, -71.02),
           ylim = c(42.27, 42.38))

3. Statistics

We might also want to gather some statistics about our zipcodes.

For example… what is the average overall connectivity score for each zipcode? We can calculate this for each zipcode using the from variable as our zipcode id and the group_by(), summarize(), and mean() functions.

stat <- fb %>%
  group_by(from) %>%
  summarize(mean = mean(scaled_sci, na.rm = TRUE))

Note: it’s always important to add na.rm = TRUE to functions like mean, median, sd, max, etc., because it tells R to exclude cases where values are missing. Otherwise, we’ll just receive a mean of NA.

Let’s check out the first 2 rows of our result! Two suburban neighborhoods, around Dedham and Canton.

stat %>% head(2)
## # A tibble: 2 × 2
##   from     mean
##   <chr>   <dbl>
## 1 02021 887578.
## 2 02026 999484.

You might be wondering, is that a large number, compared to the rest of our sampled zipcodes? We can use summarize() to find out.

Okay, so there’s a REALLY BIG DIFFERENCE between some zipcodes’ connectivity here.

fb %>% 
  summarize(
    min = min(scaled_sci, na.rm = TRUE),
    mean = mean(scaled_sci, na.rm = TRUE),
    median = median(scaled_sci, na.rm = TRUE),
    max = max(scaled_sci, na.rm = TRUE),
    sd = sd(scaled_sci, na.rm = TRUE))
## # A tibble: 1 × 5
##     min    mean median      max       sd
##   <dbl>   <dbl>  <dbl>    <dbl>    <dbl>
## 1 69086 940896. 635246 42070495 1578778.

You could find out which zipcode pairs have the min and max connectivity scores, using filter().

fb %>%
  filter(scaled_sci == max(scaled_sci, na.rm = TRUE))
## # A tibble: 1 × 3
##   from  to    scaled_sci
##   <chr> <chr>      <dbl>
## 1 02152 02152   42070495
fb %>%
  filter(scaled_sci == min(scaled_sci, na.rm = TRUE))
## # A tibble: 2 × 3
##   from  to    scaled_sci
##   <chr> <chr>      <dbl>
## 1 02150 02492      69086
## 2 02492 02150      69086

And then you could google them to find out which neighborhoods these zipcodes pertain to.

I recommend that whenever you find something out, you just got it down as a note in your code script, so you have a written record of everything you do and thought in one place.

4. Visualizing our Statistics

Finally, we can take our data and visualize it on a map, as long as we first join together the data.frame of statistics with the geospatial data.frames we need.

Eg. if I want a map of zipcodes, shaded by mean connectivity, I need to get the mean connectivity of zipcodes in data.frame A, get the polygons in data.frame B, and join them together.

Earlier, we calculated the mean overall pairwise-connectivity for each zipcode, using the from column. We saved it in a data.frame called stat.

stat <- fb %>%
  group_by(from) %>%
  summarize(mean = mean(scaled_sci, na.rm = TRUE))

Let’s combine that with our zipcode polygons, saved in zip! zip records its zipcode ID in the geoid column; while stat records its zipcode ID in the from column.

We can join them together using left_join(), which adds in values from stat into our original zip data.frame. (Joins data from the right-side into the left-side dataset.)

zip2 <- zip %>% 
  left_join(by = c("geoid" = "from"), y = stat)

We can see zip2 now contains a column called mean!

zip2 %>%
  select(geoid, mean, geometry) %>%
  head(2)
## Simple feature collection with 2 features and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -71.0856 ymin: 42.34541 xmax: -71.05719 ymax: 42.36086
## Geodetic CRS:  WGS 84
## # A tibble: 2 × 3
##   geoid    mean                                                         geometry
##   <chr>   <dbl>                                               <MULTIPOLYGON [°]>
## 1 02108 750762. (((-71.07522 42.35551, -71.07317 42.35646, -71.07231 42.3588, -…
## 2 02199     NA  (((-71.0856 42.34788, -71.08375 42.34838, -71.07951 42.34951, -…

Finally, we can visualize it like this, telling ggplot to ‘map’ the data from the mean column in zip2 to be the fill (shading) for each polygon.

ggplot() +
  # My zipcodes
  geom_sf(data = zip2, mapping = aes(fill = mean)) +
  # Points  
  geom_sf(data = sites) +
  # Crop the map
  coord_sf(xlim = c(-71.14, -71.02),
           ylim = c(42.27, 42.38))

5. Rescaling

Next, we probably want to make a solution for those enormous social connectivity index numbers. In this case, we’re going to rescale them. First, we will log-transform them, which lessens the distance between increasingly large observations. And then, we will rescale that data so that the minimum logged observation becomes 0 and the max logged value becomes 1. This gives us a much nicer, much more natural measure of connectivity.

library(scales) # for rescaling
fb <- read_csv("raw_data/boston_fb_sci.csv") %>% 
  mutate(scaled_sci = rescale(log(scaled_sci), to = c(0, 1))) 

Let’s see what it looks like now with summary() (different from summarize()).

fb$scaled_sci %>%  summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2888  0.3460  0.3564  0.4030  1.0000

Much easier to interpret!

6. Data Wrangling

Let’s import our dataset of all known social infrastructure points, both API generated and not. We’ll call it allsites.

library(tidyverse)
library(sf)

# We're going to import  
allsites <- read_csv("raw_data/boston_social_infra_2022_03_03.csv") %>%
  st_as_sf(coords = c("x", "y"), crs = 4326)

# And let's reimport the grid
grid <- read_sf("raw_data/grid_covariates_tracts.kml") %>%
  select(-c(Name:icon))


ggplot() +
  geom_sf(data = grid)  +
  geom_sf(data = allsites)

Let’s make one single dataset, where each row is a zipcode, containing rates of social infrastructure and rates of connectivity.

First, we’ll get the rates of connectivity within zipcodes.

mywithin <- fb %>%
  filter(from == to) %>%
  # For each zipcode, please get the total ties to residents within the same zipcode
  group_by(zip = from) %>%
  summarize(within = sum(scaled_sci, na.rm = TRUE)) %>%
  ungroup() %>%
  # Then rescale that from 0 to 1
  mutate(within = within %>% rescale(to = c(0, 1)))

mywithin %>% head(3)
## # A tibble: 3 × 2
##   zip   within
##   <chr>  <dbl>
## 1 02021 0.686 
## 2 02026 0.612 
## 3 02108 0.0623

Second, we’ll get the rates of connectivity between zipcodes.

mybetween <- fb %>%
  filter(from != to) %>%
  # For each zipcode, please get the total ties to other zipcodes
  group_by(zip = from) %>%
  summarize(between = sum(scaled_sci, na.rm = TRUE)) %>%
  ungroup() %>%
  # Then rescale that from 0 to 1
  mutate(between = between %>% rescale(to = c(0, 1)))

mybetween %>% head(3)
## # A tibble: 3 × 2
##   zip   between
##   <chr>   <dbl>
## 1 02021   0.296
## 2 02026   0.534
## 3 02108   0.622

Next, we’ll tally up social infrastructure sites.

allsites %>% 
  as_tibble() %>% 
  group_by(cell_id, type) %>%
  summarize(count = n()) %>%
  head()
## # A tibble: 6 × 3
## # Groups:   cell_id [2]
##   cell_id  type              count
##   <chr>    <chr>             <int>
## 1 Cell 10  Parks                 4
## 2 Cell 10  Places of Worship     2
## 3 Cell 10  Social Businesses     1
## 4 Cell 100 Community Spaces      1
## 5 Cell 100 Parks                 1
## 6 Cell 100 Social Businesses     1

There’s a faster, albeit a little less clean way to do this. Let’s count up social infrastructure by type, join in cell population density, and get the rate of sites per 1000 residents.

myrates <- allsites %>%
  as_tibble() %>%
  group_by(cell_id) %>%
  summarize(
    total = sum(!is.na(type), na.rm = TRUE),
    community_spaces = sum(type == "Community Spaces", na.rm = TRUE),
    parks = sum(type == "Parks", na.rm = TRUE),
    social = sum(type == "Social Businesses", na.rm = TRUE),
    places_of_worship = sum(type == "Places of Worship", na.rm = TRUE)) %>%
  # Join in population density
  left_join(by = "cell_id", 
            y = grid %>% as_tibble() %>% 
              select(cell_id, pop_density_int)) %>%
  # Calculate rate of social infrastructure per 1000 residents per square kilometer
  mutate(
    total = total / pop_density_int * 1000,
    community_spaces = community_spaces / pop_density_int * 1000,
    parks = parks / pop_density_int * 1000,
    social = social / pop_density_int * 1000,
    places_of_worship = places_of_worship / pop_density_int * 1000) %>%
  # Drop excess variables
  select(-pop_density_int) %>%
  # Join rates into grid
  right_join(by = "cell_id", y = grid) %>%
  # Then fill in missing values with zero, because it means there were no sites there, in this case
  mutate_at(vars(total, community_spaces, parks,social, places_of_worship),
                list(~if_else(is.na(.), 0, .))) %>%
  # Turn back into an sf object
  st_as_sf(crs = 4326) 

# Here are the variables in this one.
myrates %>% names()
##  [1] "cell_id"                     "total"                      
##  [3] "community_spaces"            "parks"                      
##  [5] "social"                      "places_of_worship"          
##  [7] "neighborhood"                "milestone"                  
##  [9] "pop_density"                 "pop_women"                  
## [11] "pop_white"                   "pop_black"                  
## [13] "pop_natam"                   "pop_asian"                  
## [15] "pop_pacific"                 "pop_hisplat"                
## [17] "pop_some_college"            "employees_muni"             
## [19] "employees_state"             "employees_fed"              
## [21] "median_income"               "income_inequality"          
## [23] "pop_unemployed"              "median_monthly_housing_cost"
## [25] "pop_age_65_plus"             "pop_density_int"            
## [27] "geometry"
mydat <- myrates %>%
  # Join in zipcode ID
  st_join(zip, left = TRUE) %>%
  # For each zipcode,
  group_by(geoid) %>%
  # Calculate the median level of concept X in a city block in that zipcode.
  summarize_at(vars(pop_density, total, community_spaces, parks,social, places_of_worship,
                     pop_age_65_plus, pop_women:pop_some_college, 
                    median_income, income_inequality, pop_unemployed, median_monthly_housing_cost),
               list(~median(., na.rm = TRUE))) %>%
  # Then join in our zipcode FB measures
  left_join(by = c("geoid" = "zip"), y = mywithin) %>%
  left_join(by = c("geoid" = "zip"), y = mybetween) 

Finally, let’s trim this dataset, by zooming into just the zipcodes located within the core Boston zone. Let’s examine.

# Load in zipcode polygons
zip <- read_sf("raw_data/zipcodes.kml") %>%
  select(-c(Name:icon))# get rid of fluff

ggplot() +
  geom_sf(data = zip %>%
            st_join(myrates, left = FALSE), color = "blue", fill = "grey") +
  geom_sf(data = myrates, fill = NA)

Let’s save this file as an .rds file, in R formatting, so we can easily access it without breaking up the geometry.

mydat %>%
  write_rds("raw_data/dataset.rds")

And let’s get rid of the rest of the data here.

rm(list = ls())

7. Correlation

Next, we’re going to investigate the correlations between social infrastructure and social connectivity.

Let’s import our data.

mydat <- read_rds("raw_data/dataset.rds") %>%
  # Because our variables are strongly right skewed, we need to log-transform them
  # However, a few cells report zero social infrastructure sites.
  # We're going to impute them with half the value of the smallest nonzero value,
  # and then log-transform, to maintain the general shape of the distribution
  mutate_at(vars(total, social, parks, places_of_worship),
            list(~if_else(. == 0, true = sort(unique(.))[2] / 2, false =  .))) %>%
  mutate_at(vars(total, social, parks, places_of_worship),
            list(~log(.)))

Each time we calculate a statistic, we need to convert to tibble first, and then run our calculation, because otherwise the mapping-formatting messes it up.

bind_rows(
  mydat %>%
    as_tibble() %>%
    summarize_at(vars(total:median_monthly_housing_cost), 
                 list(~cor(x = ., within, use = "pairwise.complete.obs"))) %>%
    mutate(ties = "Within Zipcode"),
  
  mydat %>%
    as_tibble() %>%
    summarize_at(vars(total:median_monthly_housing_cost), 
                 list(~cor(x = ., between, use = "pairwise.complete.obs"))) %>%
    mutate(ties = "Different Zipcodes")
) %>%
  pivot_longer(cols = c(total:median_monthly_housing_cost),
               names_to = "variable", values_to = "statistic") %>%
  write_csv("raw_data/correlations.csv")

Wow! That was fast!

8. Visualizing Correlations

Let’s find some clever ways to intuitively visualize this data.

mycor <- read_csv("raw_data/correlations.csv")

Let’s check out our values!

mycor
ties variable statistic
Within Zipcode total -0.545
Within Zipcode community_spaces -0.238
Within Zipcode parks -0.476
Within Zipcode social -0.603
Within Zipcode places_of_worship -0.273
Within Zipcode pop_age_65_plus 0.012
Within Zipcode pop_women -0.044
Within Zipcode pop_white -0.153
Within Zipcode pop_black 0.181
Within Zipcode pop_natam -0.085
Within Zipcode pop_asian -0.194
Within Zipcode pop_pacific -0.231
Within Zipcode pop_hisplat 0.182
Within Zipcode pop_some_college 0.538
Within Zipcode median_income -0.169
Within Zipcode income_inequality -0.472
Within Zipcode pop_unemployed 0.075
Within Zipcode median_monthly_housing_cost -0.427
Different Zipcodes total 0.708
Different Zipcodes community_spaces 0.399
Different Zipcodes parks 0.681
Different Zipcodes social 0.662
Different Zipcodes places_of_worship 0.644
Different Zipcodes pop_age_65_plus -0.041
Different Zipcodes pop_women -0.053
Different Zipcodes pop_white -0.408
Different Zipcodes pop_black 0.491
Different Zipcodes pop_natam 0.470
Different Zipcodes pop_asian -0.190
Different Zipcodes pop_pacific 0.078
Different Zipcodes pop_hisplat 0.014
Different Zipcodes pop_some_college 0.039
Different Zipcodes median_income -0.057
Different Zipcodes income_inequality 0.536
Different Zipcodes pop_unemployed 0.342
Different Zipcodes median_monthly_housing_cost 0.004

First, let’s look at social infrastructure.

mycor %>%
  filter(variable %in% c("total", "parks", "community_spaces", 
                         "places_of_worship", "social")) %>%
  ggplot(mapping = aes(x = ties, y = variable, 
                       fill = statistic, label = round(statistic, 2))) +
  geom_tile() +
  geom_text(color = "white", fontface = "bold", size = 5) +
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)

Now, using Khushi’s excellent visualization design, let’s convert this into a barchart.

mycor %>%
  filter(variable %in% c("total", "social", "community_spaces",
                         "places_of_worship", "parks")) %>%
  mutate(variable = variable %>% factor() %>% recode_factor(
    "total" = "Total",
    "community_spaces" = "Community\nSpaces",
    "places_of_worship" = "Places\nof Worship",
    "social" = "Social\nBusineses",
    "parks" = "Parks"),
    textposition = if_else(ties == "Different Zipcodes",
                                      true = statistic + 0.1, false = statistic - 0.1)) %>%
  ggplot(mapping = aes(x = variable, y = statistic, fill = ties, label = round(statistic, 2))) +
  geom_col() +
  geom_text(mapping = aes(y = textposition)) +
  scale_fill_manual(values = c("#648FFF", "#DC267F"), name = NULL) +
  ylim(c(-1, 1)) +
  theme_minimal(base_size = 14) +
  theme(legend.position = c(0.20, 0.99),
        panel.grid = element_blank(),
        axis.line.y.left = element_line(color = "black", size= 0.5)) +
  labs(x = "Type of Social Infrastructure", 
       y = "Correlation (-1 to 1)\n(between Ties and Social Infrastructure)")

Next, let’s look at demographics.

mycor %>% 
  filter(!variable %in% c("total", "parks", "community_spaces", 
                          "places_of_worship", "social")) %>%
  ggplot(mapping = aes(x = ties, y = variable, 
                       fill = statistic, label = round(statistic, 2))) +
  geom_tile() +
  geom_text(color = "black") +
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)

Using Khushi’s visualization code, let’s convert this into a bar chart too!

mycor %>%
  filter(!variable %in% c("total", "parks", "community_spaces", 
                          "places_of_worship", "social")) %>%
  mutate(variable = variable %>% factor() %>%
           recode_factor(
             "pop_women" = "% Women",
             "pop_age_65_plus" = "% Over Age 65",
             "pop_black" = "% Black",
             "pop_white" = "% White",
             "pop_hisplat" = "% Hispanic/Latino",
             "pop_asian" = "% Asian",
             "pop_natam" = "% Native American",
             "pop_pacific" = "% Pacific Islander",
             "pop_some_college" = "% Some College",
             "pop_unemployed" = "% Unemployed",
             "median_income" = "Median Income",
             "median_monthly_housing_cost" =  "Median Monthly\nHousing Cost", 
             "income_inequality" = "Gini Coefficient") %>%
           factor(levels = levels(.) %>% rev()),
         statistic = round(statistic, 2)) %>%
  ggplot(mapping = aes(x = variable,
                       y = statistic,
                       color = ties,
                       fill = ties,
                       label = statistic)) +
  geom_point(size = 3, shape = 21, color = "white") +
  geom_segment(
    mapping = aes(x = variable, xend = variable, 
                  y = 0, yend = statistic),
    position = "dodge") +
  geom_text(data = . %>%
              filter(ties == "Different Zipcodes"),
            mapping = aes(label = statistic),
            nudge_x = 0.5) +
  geom_text(data = . %>%
              filter(ties == "Within Zipcode"),
            mapping = aes(label = statistic),
            nudge_x = -0.5) +
  geom_hline(yintercept = 0) +
  scale_fill_manual(values = c("#648FFF", "#DC267F") %>% rev(), name = NULL) +
  scale_color_manual(values = c("#648FFF", "#DC267F") %>% rev(), name = NULL) +
  scale_x_discrete(expand = expansion(add = c(1, 1))) +
  coord_flip() +
  theme_bw(base_size = 14) + 
  theme(legend.position = "bottom",
        panel.grid.minor.y = element_blank(),
        panel.grid.major.y = element_blank()) +
  labs(x = NULL,
       y = "Correlation (-1 to 1)\n(between Social Infrastructure and Ties)",
       fill = NULL, color = NULL)

# 9. Scatterplot

mydat <- read_rds("raw_data/dataset.rds") %>%
  # Because our variables are strongly right skewed, we need to log-transform them
  # However, a few cells report zero social infrastructure sites.
  # We're going to impute them with half the value of the smallest nonzero value,
  # and then log-transform, to maintain the general shape of the distribution
  mutate_at(vars(total, social, parks, places_of_worship),
            list(~if_else(. == 0, true = sort(unique(.))[2] / 2, false =  .))) 

Let’s gather some very simple models too.

mw <- mydat %>%
    lm(formula = within ~ log(total))

mb <- mydat %>%
    lm(formula = between ~ log(total))
    
library(broom)

# Now, let's gather some basic data about our simple models,
# include the R-squard, fstatistic, p-value, and sigma
mystat <- bind_rows(
  tidy(mw) %>%
    filter(term == "log(total)") %>%
    bind_cols(
      mw %>% glance() %>% 
        select(r = r.squared, fstat = statistic, fstat_p = p.value, sigma)) %>%
    mutate(variable = "Within Zipcode"),
  
  tidy(mb) %>%
    filter(term == "log(total)") %>%
    bind_cols(
      mb %>% glance() %>% 
        select(r = r.squared, fstat = statistic, fstat_p = p.value, sigma)) %>%
    mutate(variable = "Different Zipcodes")
) %>%
  mutate(estimate = round(estimate, 2),
         statistic = round(statistic, 2),
         plab = round(p.value, 3),
         plab = if_else(plab == "0", "p < 0.001", paste("p = ", plab, sep = "")),
         r = round(r, 2),
         sigma = round(sigma, 2),
         fstat = round(fstat, 2),
         
         label = paste("Beta = ", estimate, gtools::stars.pval(p.value), 
                       "\n", "Variation Explained: ", r*100,"%", "\n", 
                       "Avg. Error: ", sigma, sep = ""))
mydat %>%
  pivot_longer(cols = c(within, between), 
               names_to = "variable", values_to = "value") %>%
  mutate(variable = variable %>% recode_factor(
    "between" = "Different Zipcodes",
    "within" = "Within Zipcode")) %>%
  ggplot(mapping = aes(x = total, y = value, fill = variable, color = variable, group = variable)) +
  geom_point(shape = 21, size = 5, color = 'white') +
  geom_smooth(method = "lm", formula = y ~ log(x), color = "white") +
  shadowtext::geom_shadowtext(data = mystat, mapping = aes(group = variable, x = 0.5, y = c(0.75, 0.25), label = label),
                              bg.r = 0.2, bg.color = "white", color = "#373737", hjust = 0, fontface = "bold") +
  facet_wrap(~variable) +
  theme_classic(base_size = 14) +
  scale_fill_manual(values = c("#648FFF", "#DC267F")) +
  guides(fill = "none", color = "none") +
  theme(plot.caption = ggtext::element_markdown(size = 10, hjust = 0),
        panel.spacing.x = unit(0.5, "cm"),
        strip.background = element_blank(),
        panel.border = element_rect(color = "#373737", fill = NA),
        axis.line = element_blank()) +
  scale_x_continuous(expand = expansion(add = c(0.25, -0.5))) +
  labs(caption = "<b>Note</b>: Based on simple bivariate OLS regression model.
       <br><b>Model Equation</b>: Social Connectivity ~ log(Social Infrastructure Rate)
       <br>*<b>Rate</b>: Zipcode-averaged rate of social infrastructure, per 1000 residents, per 1 km<sup>2</sup>.",
       x = "Rate of Total Social Infrastructure*",
       y = "Social Connectivity (0 to 1)")
## Warning: Removed 6 rows containing non-finite values (stat_smooth).
## Warning: Removed 6 rows containing missing values (geom_point).

10. Data Revisited

Also, for context, what did the raw dataset look like? Let’s check it out.

# Import raw data
mydat <- read_rds("raw_data/dataset.rds") %>%
    # Convert to data.frame/tibble for easy reading
  as_tibble() %>%
  select(geoid, 
         # Add social connectivity
         within, between, 
         # Add social infrastructure
         total:places_of_worship, 
         # Add Demographics
         pop_density, pop_age_65_plus:median_monthly_housing_cost)
mydat
geoid within between total community_spaces parks social places_of_worship pop_density pop_age_65_plus pop_women pop_white pop_black pop_natam pop_asian pop_pacific pop_hisplat pop_some_college median_income income_inequality pop_unemployed median_monthly_housing_cost
02021 0.686 0.296 0.000 0.000 0.000 0.000 0.000 1855.807 0.127 0.490 0.456 0.292 0.000 0.072 0.000 0.214 0.257 81063.00 0.370 0.036 1485.000
02026 0.612 0.534 0.406 0.000 0.222 0.000 0.000 2633.603 0.165 0.538 0.505 0.292 0.000 0.078 0.000 0.169 0.182 76007.50 0.424 0.036 1473.000
02108 0.062 0.622 4.370 0.507 1.820 0.611 0.498 12346.511 0.148 0.519 0.790 0.030 0.001 0.126 0.000 0.074 0.058 108540.36 0.554 0.043 2348.825
02109 0.247 0.886 3.089 0.119 1.590 0.446 0.345 13100.245 0.117 0.461 0.788 0.073 0.001 0.084 0.001 0.243 0.083 105463.36 0.517 0.044 2224.462
02110 0.019 0.641 3.798 0.197 1.731 0.617 0.289 10060.986 0.117 0.465 0.768 0.049 0.001 0.116 0.001 0.138 0.071 105463.36 0.520 0.046 2224.462
02111 0.117 0.810 3.494 0.299 1.897 0.499 0.399 10060.986 0.140 0.497 0.739 0.030 0.000 0.167 0.000 0.082 0.072 108900.12 0.521 0.053 2159.500
02113 NA NA 5.258 0.837 2.498 1.159 0.765 13100.245 0.130 0.486 0.821 0.049 0.001 0.084 0.001 0.138 0.083 105463.36 0.517 0.037 2224.462
02114 0.075 0.779 3.479 0.239 1.739 0.458 0.574 12346.511 0.142 0.510 0.822 0.045 0.000 0.091 0.000 0.083 0.072 108900.12 0.512 0.034 2355.200
02115 0.079 0.644 2.030 0.164 0.916 0.530 0.205 13733.234 0.133 0.539 0.610 0.150 0.002 0.151 0.001 0.115 0.062 53537.43 0.546 0.098 1776.500
02116 0.049 0.783 2.693 0.252 1.094 0.499 0.395 12507.108 0.144 0.528 0.711 0.053 0.002 0.148 0.000 0.092 0.061 97113.46 0.577 0.062 2100.727
02118 0.233 0.938 2.672 0.231 1.143 0.382 0.368 10973.122 0.109 0.514 0.542 0.200 0.003 0.110 0.000 0.209 0.097 62586.51 0.547 0.093 1467.006
02119 0.409 0.965 3.330 0.392 1.196 0.399 0.712 8357.402 0.113 0.523 0.278 0.476 0.006 0.041 0.000 0.286 0.141 40614.00 0.523 0.113 1231.333
02120 0.401 0.857 2.693 0.369 1.308 0.449 0.561 9356.877 0.116 0.535 0.538 0.208 0.003 0.122 0.000 0.186 0.087 48611.25 0.506 0.099 1408.500
02121 0.455 0.970 2.426 0.194 1.077 0.301 0.989 8451.741 0.111 0.502 0.188 0.586 0.007 0.022 0.000 0.291 0.166 40348.27 0.507 0.140 1288.000
02122 0.340 0.852 1.103 0.123 0.473 0.247 0.239 6086.799 0.117 0.517 0.561 0.222 0.001 0.088 0.000 0.124 0.144 80114.60 0.455 0.063 1685.000
02124 0.394 1.000 1.534 0.136 0.348 0.244 0.362 6159.891 0.123 0.520 0.302 0.499 0.003 0.071 0.000 0.140 0.168 64466.46 0.460 0.082 1672.500
02125 0.252 0.878 1.418 0.150 0.600 0.247 0.279 8084.176 0.104 0.499 0.404 0.259 0.002 0.109 0.000 0.232 0.116 62040.50 0.506 0.079 1493.800
02126 0.442 0.950 0.850 0.000 0.152 0.153 0.425 5377.362 0.139 0.521 0.171 0.691 0.003 0.012 0.000 0.181 0.198 61934.20 0.467 0.102 1670.667
02127 0.452 0.924 0.750 0.124 0.567 0.114 0.127 8031.003 0.077 0.431 0.795 0.073 0.000 0.053 0.000 0.192 0.090 114139.40 0.490 0.029 2008.000
02128 0.340 0.542 0.591 0.000 0.228 0.000 0.000 3899.365 0.114 0.465 0.694 0.039 0.001 0.046 0.002 0.415 0.120 70281.20 0.465 0.052 1647.667
02129 0.451 0.822 0.523 0.000 0.337 0.061 0.000 4137.486 0.111 0.481 0.830 0.061 0.000 0.069 0.000 0.096 0.092 116227.33 0.489 0.022 2064.400
02130 0.145 0.780 1.382 0.298 0.933 0.127 0.428 4679.716 0.149 0.542 0.645 0.172 0.003 0.045 0.000 0.197 0.115 88313.98 0.448 0.090 1830.792
02131 0.296 0.804 0.948 0.000 0.239 0.214 0.372 4475.363 0.127 0.518 0.598 0.256 0.003 0.023 0.000 0.246 0.182 79330.00 0.436 0.077 1688.500
02132 0.428 0.695 1.048 0.000 0.349 0.237 0.237 2492.811 0.193 0.531 0.800 0.041 0.000 0.082 0.000 0.083 0.138 99105.00 0.404 0.041 1688.500
02134 0.128 0.561 0.811 0.000 0.061 0.227 0.069 4988.269 0.049 0.513 0.600 0.070 0.002 0.220 0.000 0.132 0.076 68228.90 0.475 0.056 1920.500
02135 0.090 0.651 1.048 0.055 0.283 0.288 0.000 6285.313 0.090 0.513 0.698 0.059 0.001 0.152 0.000 0.119 0.093 78681.36 0.445 0.035 1846.357
02136 0.329 0.880 0.840 0.000 0.234 0.217 0.293 3478.778 0.131 0.520 0.411 0.437 0.000 0.034 0.000 0.246 0.206 76628.75 0.437 0.076 1594.500
02138 0.502 0.329 0.134 0.000 0.134 0.000 0.000 2651.759 0.062 0.505 0.601 0.077 0.001 0.213 0.000 0.122 0.080 74388.25 0.475 0.065 1987.167
02139 0.345 0.596 0.296 0.069 0.090 0.069 0.069 4988.269 0.038 0.537 0.623 0.063 0.002 0.220 0.000 0.121 0.041 61839.33 0.548 0.096 2092.667
02141 0.149 0.625 0.848 0.000 0.636 0.141 0.000 4274.837 0.114 0.487 0.862 0.048 0.000 0.052 0.000 0.061 0.083 131138.50 0.431 0.016 2461.100
02142 0.000 0.752 1.695 0.000 1.086 0.197 0.000 12185.914 0.142 0.510 0.838 0.045 0.000 0.075 0.000 0.083 0.062 119714.00 0.507 0.034 2415.250
02143 0.200 0.516 0.280 0.000 0.093 0.186 0.000 3482.079 0.088 0.476 0.869 0.053 0.000 0.041 0.000 0.045 0.083 136980.75 0.405 0.014 2347.750
02145 0.301 0.539 0.487 0.000 0.186 0.000 0.000 2363.138 0.086 0.465 0.885 0.052 0.000 0.029 0.000 0.039 0.074 142563.00 0.378 0.013 2567.000
02148 0.176 0.353 0.000 0.000 0.000 0.000 0.000 2374.588 0.129 0.511 0.807 0.083 0.000 0.056 0.000 0.228 0.116 72721.00 0.434 0.030 1689.000
02149 0.337 0.338 0.000 0.000 0.000 0.000 0.000 3673.953 0.126 0.481 0.693 0.079 0.000 0.039 0.000 0.370 0.136 61254.50 0.427 0.049 1581.000
02150 0.509 0.339 0.000 0.000 0.000 0.000 0.000 7115.202 0.115 0.479 0.590 0.065 0.001 0.041 0.000 0.519 0.134 58841.40 0.443 0.056 1496.750
02151 0.346 0.380 0.000 0.000 0.000 0.000 0.000 3372.973 0.144 0.505 0.788 0.055 0.000 0.052 0.000 0.279 0.143 61658.00 0.462 0.053 1499.375
02152 1.000 0.420 0.000 0.000 0.000 0.000 0.000 3820.653 0.139 0.513 0.892 0.023 0.001 0.009 0.000 0.137 0.204 76974.00 0.434 0.050 1571.333
02163 0.955 0.147 0.977 0.293 0.391 0.098 0.195 3286.801 0.049 0.513 0.604 0.074 0.002 0.218 0.000 0.128 0.059 70879.08 0.483 0.087 2092.667
02171 0.432 0.376 0.000 0.000 0.000 0.000 0.000 3270.169 0.077 0.448 0.770 0.174 0.000 0.070 0.000 0.292 0.115 90893.00 0.419 0.040 1731.333
02176 0.613 0.082 0.000 0.000 0.000 0.000 0.000 2374.588 0.129 0.511 0.807 0.083 0.000 0.056 0.000 0.228 0.116 72721.00 0.434 0.030 1689.000
02186 0.479 0.638 0.205 0.000 0.000 0.000 0.000 4239.814 0.156 0.508 0.428 0.387 0.000 0.057 0.000 0.152 0.210 79156.50 0.456 0.068 1644.667
02199 NA NA 2.189 0.203 0.777 0.926 0.245 16129.277 0.139 0.535 0.698 0.092 0.002 0.141 0.002 0.115 0.065 80416.03 0.571 0.080 2053.682
02203 NA NA 5.261 1.435 2.487 0.765 0.574 13100.245 0.140 0.497 0.822 0.030 0.000 0.105 0.001 0.082 0.072 108900.12 0.521 0.046 2448.625
02210 0.150 0.840 0.844 0.000 0.381 0.127 0.000 8138.914 0.069 0.393 0.795 0.073 0.000 0.053 0.000 0.108 0.085 127977.50 0.482 0.021 2311.800
02215 0.075 0.435 1.527 0.121 0.220 0.356 0.158 11140.257 0.125 0.551 0.623 0.080 0.002 0.173 0.000 0.114 0.057 46126.88 0.546 0.098 1924.625
02445 0.192 0.468 0.938 0.000 0.272 0.062 0.176 4333.078 0.241 0.572 0.792 0.047 0.002 0.076 0.000 0.105 0.065 98831.64 0.454 0.037 1776.500
02446 0.123 0.469 1.207 0.107 0.206 0.296 0.342 10636.854 0.076 0.544 0.623 0.062 0.003 0.202 0.000 0.118 0.064 53323.12 0.472 0.079 1776.500
02458 0.140 0.602 0.238 0.078 0.000 0.160 0.000 5845.176 0.093 0.510 0.739 0.057 0.000 0.121 0.002 0.106 0.088 95480.67 0.411 0.028 1983.167
02459 0.600 0.191 0.313 0.000 0.000 0.000 0.000 1566.452 0.165 0.503 0.800 0.041 0.000 0.084 0.000 0.057 0.129 112468.00 0.339 0.046 1773.000
02467 0.359 0.357 0.892 0.000 0.289 0.163 0.000 2470.987 0.165 0.542 0.810 0.033 0.000 0.085 0.000 0.102 0.105 106617.00 0.441 0.035 1735.000
02472 0.391 0.372 0.268 0.000 0.268 0.000 0.000 3377.791 0.088 0.497 0.601 0.072 0.000 0.207 0.000 0.122 0.096 90980.00 0.421 0.050 1896.000
02492 0.803 0.000 0.000 0.000 0.000 0.000 0.000 1566.452 0.165 0.503 0.800 0.041 0.000 0.084 0.000 0.057 0.129 112468.00 0.339 0.046 1773.000

To replicate Khushi’s excellent analysis with our new rescaled measures of connectivity, let’s write a code chunk that will output the results online, so we can refer to them when we need them.

stat_within <- mydat %>%
  summarize(
    # Get mean
    mean = mean(within, na.rm = TRUE),
    # Get standard deviation
    sd = sd(within, na.rm = TRUE),
    # Get number of valid responses
    nobs = sum(!is.na(within)),
    # Get smallest connectivity score (always 0)
    min = min(within, na.rm = TRUE),
    # Get 25th percentile
    p25 = quantile(within, probs = 0.25, na.rm = TRUE),
    # Get 50th percentile (median)
    p50 = quantile(within, probs = 0.50, na.rm = TRUE),
    # Get 75th percentile
    p75 = quantile(within, probs = 0.75, na.rm = TRUE),
    # Get max (always 1)
    max = max(within, na.rm = TRUE))

Let’s repeat for between-zipcode ties!

stat_between <- mydat %>%
  summarize(
    # Get mean
    mean = mean(between, na.rm = TRUE),
    # Get standard deviation
    sd = sd(between, na.rm = TRUE),
    # Get number of valid responses
    nobs = sum(!is.na(between)),
    # Get smallest connectivity score (always 0)
    min = min(between, na.rm = TRUE),
    # Get 25th percentile
    p25 = quantile(between, probs = 0.25, na.rm = TRUE),
    # Get 50th percentile (median)
    p50 = quantile(between, probs = 0.50, na.rm = TRUE),
    # Get 75th percentile
    p75 = quantile(between, probs = 0.75, na.rm = TRUE),
    # Get max (always 1)
    max = max(between, na.rm = TRUE))

Finally, let’s combine and reformat them for easy viewing…

mystat <- bind_rows(
  stat_within %>% mutate(ties = "within"), 
  stat_between %>% mutate(ties = "between")) %>%
  # Pivot longer, so that statistics get put all in the same column
  pivot_longer(cols = -c(ties), names_to = "variable", values_to = "value") %>%
  # Split up statistic column by type of ties
  pivot_wider(id_cols = variable, names_from = ties, values_from = value)

Let’s view them!

mystat
variable within between
mean 0.334 0.605
sd 0.229 0.254
nobs 50.000 50.000
min 0.000 0.000
p25 0.146 0.390
p50 0.338 0.623
p75 0.449 0.819
max 1.000 1.000

Finally, let’s find the zipcodes within our study region (core Boston) with the min, median, and max level of connectivity for each type of tie.

myorder <- bind_rows(
  
  mydat %>%
    select(geoid, value = between) %>% 
    filter(!is.na(value)) %>%
    arrange(value) %>%
    slice(26) %>%
    mutate(ties = "between", type = "median"),
  
  mydat %>%
    select(geoid, value = within) %>% 
    filter(!is.na(value)) %>%
    arrange(value) %>%
    slice(26) %>%
    mutate(ties = "within", type = "median"),
  
  mydat %>%
    select(geoid, value = between) %>% 
    filter(!is.na(value)) %>%
    arrange(value) %>%
    slice(1) %>%
    mutate(ties = "between", type = "min"),
  
  mydat %>%
    select(geoid, value = within) %>% 
    filter(!is.na(value)) %>%
    arrange(value) %>%
    slice(1) %>%
    mutate(ties = "within", type = "min"),
  
  
  
  mydat %>%
    select(geoid, value = between) %>% 
    filter(!is.na(value)) %>%
    arrange(value) %>%
    slice(50) %>%
    mutate(ties = "between", type = "max"),
  
  mydat %>%
    select(geoid, value = within) %>% 
    filter(!is.na(value)) %>%
    arrange(value) %>%
    slice(50) %>%
    mutate(ties = "within", type = "max")
  
) %>%
  arrange(ties, type) %>%
  select(ties, type, geoid, value)

Let’s view them in a nice table!

myorder
ties type geoid value
between max 02124 1.0000000
between median 02141 0.6248081
between min 02492 0.0000000
within max 02152 1.0000000
within median 02122 0.3397228
within min 02142 0.0000000

Finally, let’s make a few more visuals for the paper.

11. Map 1

library(tidyverse)
library(sf)
library(tigris)

mydat <- read_rds("raw_data/dataset.rds") %>%
  as_tibble() %>% select(-geometry) %>%
  # Because our variables are strongly right skewed, we need to log-transform them
  # However, a few cells report zero social infrastructure sites.
  # We're going to impute them with half the value of the smallest nonzero value,
  # and then log-transform, to maintain the general shape of the distribution
  mutate_at(vars(total, social, parks, places_of_worship),
            list(~if_else(. == 0, true = sort(unique(.))[2] / 2, false =  .))) 

csub <- tigris::county_subdivisions(year = 2020, county = "Suffolk", state = "MA", cb = TRUE) %>%
  st_as_sf() %>%
  st_transform(crs = 4326) %>%
  filter(NAME == "Boston")

zip <- read_sf("raw_data/zipcodes.kml") %>% select(geoid, geometry) %>% st_as_sf()

# Get background
bg <- zip %>% st_union() %>% st_make_valid() %>% nngeo::st_remove_holes()

zipdata <- zip %>% left_join(by= "geoid", y = mydat)

fb <- read_csv("raw_data/boston_fb_sci.csv") %>% 
  mutate(scaled_sci = rescale(log(scaled_sci), to = c(0, 1)))  %>%
  left_join(by = c("from" = "geoid"), 
            y = points %>% as_tibble() %>% select(geoid, from_x = x, from_y = y)) %>%
  left_join(by = c("to" = "geoid"), 
            y = points %>% as_tibble() %>% select(geoid, to_x = x, to_y = y)) %>%
  filter(from_x != to_x, from_y != to_y)

g1 <- ggplot() +
  geom_sf(data = bg, color = "lightgrey", fill = "black", size = 2)  +
  geom_sf(data = zipdata, mapping = aes(fill = total), color = "white", size = 0.1) +
  geom_sf(data = csub, color = "black", size = 1.5, fill = NA) +
  labs(fill = "Rate of Total<br>Social<br>Infrastructure") +
  theme_void(base_size = 14) +
  theme(plot.caption = ggtext::element_markdown(size = 10, hjust = 0),
        legend.title = ggtext::element_markdown(size = 12, hjust = 0)) +
  scale_fill_gradient(low = "black", high = "white", trans = "log",
                      breaks = c(0.07, .36, 1, 5.26),
                        labels =c("0.07", ".36", "1","5.26")) +
  theme(legend.text = element_text(hjust = 0))

g2 <- ggplot() +
  geom_sf(data = bg, color = "lightgrey", fill = "black", size = 2)  +
  geom_sf(data = zipdata, mapping = aes(fill = between), color = "white", size = 0.1) +
  geom_sf(data = csub, color = "black", size = 1.5, fill = NA) +
  labs(fill = "Connectivity<br>Between<br>Zipcodes") +
  theme_void(base_size = 14) +
  theme(plot.caption = ggtext::element_markdown(size = 10, hjust = 0),
        legend.title = ggtext::element_markdown(size = 12, hjust = 0)) +
  scale_fill_gradient(low = "black", high = "#DC267F", na.value = "black",
                      breaks = c(0, 0.33, 0.66, 1), labels = c("0", ".33", ".66", "1")) +
  theme(legend.text = element_text(hjust = 0))

g3 <- ggplot() +
  geom_sf(data = bg, color = "lightgrey", fill = "black", size = 2)  +
  geom_sf(data = zipdata, mapping = aes(fill = within), color = "white", size = 0.1) +
  geom_sf(data = csub, color = "black", size = 1.5, fill = NA) +
  labs(fill = "Connectivity<br>Within<br>Same Zipcode") +
  theme_void(base_size = 14) +
  theme(plot.caption = ggtext::element_markdown(size = 10, hjust = 0),
        legend.title = ggtext::element_markdown(size = 12, hjust = 0)) +
  scale_fill_gradient(low = "black", high = "#648FFF", na.value = "black", 
                      breaks = c(0, 0.33, 0.66, 1), labels = c("0", ".33", ".66", "1"))+
  theme(legend.text = element_text(hjust = 0))

combo <- ggpubr::ggarrange(g1,g2,g3, nrow = 1, legend = "bottom")
ggsave(combo, filename = "raw_data/map.png", dpi = 500, width = 10, height = 4.5)

12. Map 2

allsites <- read_csv("raw_data/boston_social_infra_2022_03_03.csv") %>%
  st_as_sf(coords = c("x", "y"), crs = 4326)

# And let's reimport the grid
grid <- read_sf("raw_data/grid_covariates_tracts.kml") %>%
  select(-c(Name:icon)) %>%
  st_join(csub %>% select(geometry), left = FALSE)


b1 <- ggplot() +
  geom_sf(data = bg, color = "lightgrey", fill = "black", size = 2)  +
  geom_sf(data = zipdata, mapping = aes(fill = pop_density),
          color = "white", size = 0.1) +
  geom_sf(data = grid, fill = NA, color = "darkgrey", size = 0.7) +
  geom_sf(data = grid, fill = NA, color = "white", size = 0.5) +
  geom_sf(data = csub, color = "white", size = 1.5, fill = NA) +
  geom_sf(data = csub, color = "black", size = 1, fill = NA) +
  geom_sf(data = allsites, shape = 21, color = "white", fill = "#785EF0", size = 1.5) +
  # Crop the map
  coord_sf(xlim = c(-71.2, -70.95),
           ylim = c(42.23, 42.40)) +
  labs(fill = "Population Density<br>per 1000 residents<br>per 1 km<sup>2</sup>") +
  theme_void(base_size = 14) +
  theme(plot.caption = ggtext::element_markdown(size = 10, hjust = 0),
        legend.title = ggtext::element_markdown(size = 12, hjust = 0)) +
  scale_fill_gradient(low = "black", high = "white", na.value = "black", 
                      trans = "log", guide = guide_colorbar(barwidth = 10, frame.colour = "#373737"),
                      breaks = c(1000, 2000, 3000, 6000, 10000, 16000, 30000),
                      labels = c("1", "2", "3", "6", "10", "16", "30")) +
  theme(legend.text = element_text(hjust = 0),
        legend.position = "bottom") 

# And let's view some distributions too, to demonstrate that they are right skewed.

b2 <- read_rds("raw_data/dataset.rds")  %>%
  pivot_longer(cols = c(total, community_spaces, social, places_of_worship, parks),
               names_to = "variable", values_to = "statistic") %>%
   mutate(variable = variable %>% factor() %>% recode_factor(
    "total" = "Total",
    "community_spaces" = "Community Spaces",
    "places_of_worship" = "Places of Worship",
    "social" = "Social Busineses",
    "parks" = "Parks")) %>%
  ggplot(mapping = aes(x = statistic, fill = variable)) +
  geom_density(mapping = aes(y = ..scaled..*100), fill = "#785EF0") +
  geom_text(mapping = aes(x = 2, y = 50, label = variable), hjust = 0, vjust = 0) +
  facet_grid(rows = vars(variable)) +
  guides(fill = "none") +
  viridis::scale_fill_viridis(discrete = TRUE, option = "plasma", begin = 0, end = 0.8) +
  scale_y_continuous(breaks = c(0, 50, 100), labels = c("0", "50", "100")) +
  scale_x_continuous(expand = expansion(add = c(0,0))) +
  theme_minimal(base_size = 14) +
  theme(panel.grid = element_blank(), 
        strip.background = element_blank(), strip.text = element_blank(),
        axis.line.y.left = element_line(color = "black"),
        panel.spacing.y = unit(0.5, "cm"),
        plot.margin = margin(0,0,0,0,"cm"),
        plot.subtitle = element_text(hjust = 0.5)) +
  labs(y = NULL,  x = "Average Zipcode Rate of Social Infrastructure\nper 1000 residents per 1 square kilometer",
       subtitle = "% of Zipcodes by Social Infrastructure")

combo <- ggpubr::ggarrange(b1,b2, nrow = 1, labels = c("A", "B"))
ggsave(combo, filename = "raw_data/map_distributions.png", dpi = 500, width = 9, height = 5)

13. Summary Statistics

z1 <- zipdata %>%
  na.omit() %>%
  as_tibble() %>%
  select(-geometry) %>%
  mutate(pop_density = pop_density / 1000,
         median_income = median_income / 1000) %>%
  pivot_longer(cols = c(pop_density:between), names_to = "variable", values_to = "value") %>%
  group_by(variable) %>%
  summarize(mean = mean(value, na.rm = TRUE) %>% round(3),
            sd = sd(value, na.rm = TRUE) %>% round(3),
            min = min(value, na.rm = TRUE) %>% round(3),
            median = median(value, na.rm = TRUE) %>% round(3),
            max = max(value, na.rm = TRUE) %>% round(3),
            nobs = sum(!is.na(value))) %>%
  ungroup() %>%
  pivot_longer(cols = c(mean, sd, min, median, max), names_to = "type", values_to = "stat") %>%
  mutate(group = case_when(variable %in% c("between", "within",
                                           "total", "community_spaces", "places_of_worship",
                                           "parks", "social") ~ "Main\nVariables",
                           TRUE ~ "Demographic\nVariables"),
         group = factor(group, levels = c("Main\nVariables", "Demographic\nVariables"))) %>%
  mutate(variable = variable %>% dplyr::recode_factor(
    "between" = "Ties Between Different Zipcodes",
    "within" = "Ties Within Same Zipcode",
    "total" = "Total Social Infrastructure Rate",
    "community_spaces" = "Community Spaces Rate",
    "places_of_worship" = "Places of Worship Rate",
    "social" = "Social Businesses Rate",
    "parks" = "Parks Rate",
    "pop_density" = "Population Density",
    "pop_women" = "% Women",
    "pop_age_65_plus" = "% Over Age 65",
    "pop_black" = "% Black",
    "pop_white" = "% White",
    "pop_hisplat" = "% Hispanic/Latino",
    "pop_asian" = "% Asian",
    "pop_natam" = "% Native American",
    "pop_pacific" = "% Pacific Islander",
    "pop_some_college" = "% Some College",
    "pop_unemployed" = "% Unemployed",
    "median_income" = "Median Income (1000s)",
    "median_monthly_housing_cost" =  "Median Monthly\nHousing Cost", 
    "income_inequality" = "Gini Coefficient") %>%
      factor(levels = levels(.) %>% rev())) %>%
  mutate(type = type %>% recode_factor(
    "mean" = "Mean",
    "sd" = "Std. Dev.",
    "min" = "Min",
    "median" = "Median",
    "max" = "Max")) %>%
  ggplot(mapping = aes(y = variable, x = type, label = stat)) +
  geom_tile(fill = NA, color = "grey") +
  geom_text() +
  facet_grid(rows = vars(group), scales = "free_y", shrink = TRUE, space = "free") +
  theme_classic(base_size = 14) +
  theme(strip.background = element_blank(),
        strip.text.y = element_text(hjust = 0, angle = 0)) +
  scale_x_discrete(position = "top", expand = expansion(add = c(0,0))) +
  labs(x = "Descriptive Statistics for Zipcodes (n = 50)",
       y = NULL) +
  theme(panel.border = element_rect(fill = NA, color = "black"),
        axis.ticks = element_blank())

ggsave(z1, filename = "raw_data/descriptives.png", dpi = 500, width = 7, height = 7.5)