1. Importing Data

Download a copy of this project here!

library(tidyverse)
library(sf)
library(scales)

Let’s import our key geographic data.

# Load in social infrastructure points
sites <- read_sf("raw_data/mygoogle.kml") %>%
  select(-c(Name:icon)) %>% # get rid of fluff
  select(group, geometry) # we only need these two columns

# Load in county polygon
county <- read_sf("raw_data/county.kml") %>%
  select(-c(Name:icon)) # get rid of fluff

# Load in zipcode polygons
zip <- read_sf("raw_data/zipcodes.kml") %>%
  select(-c(Name:icon))# get rid of fluff

# Load in grid
grid <- read_sf("raw_data/grid_covariates_tracts.kml") %>%
  select(-c(Name:icon)) # get rid of fluff

Let’s import our Facebook data!

fb <- read_csv("raw_data/boston_fb_sci.csv")

2. Viewing the Data

As our next step, let’s try to view some of this data!

First, let’s check what are the contents of each dataset!

Each row here is a social infrastructure site!

sites %>% head(2)

## Simple feature collection with 2 features and 1 field
## Geometry type: POINT
## Dimension:     XYZ
## Bounding box:  xmin: -71.10646 ymin: 42.31905 xmax: -71.045 ymax: 42.34176
## z_range:       zmin: 0 zmax: 0
## Geodetic CRS:  WGS 84
## # A tibble: 2 × 2
##   group                            geometry
##   <chr>                         <POINT [°]>
## 1 Community Spaces   Z (-71.045 42.31905 0)
## 2 Community Spaces Z (-71.10646 42.34176 0)

Each row here is a county. (Just 1!)

county %>% head(2)

## Simple feature collection with 1 feature and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -71.19115 ymin: 42.22793 xmax: -70.9226 ymax: 42.45012
## Geodetic CRS:  WGS 84
## # A tibble: 1 × 3
##   geoid area_land                                                       geometry
##   <chr>     <dbl>                                             <MULTIPOLYGON [°]>
## 1 25025 150863059 (((-70.93091 42.3216, -70.93025 42.32229, -70.92597 42.32191,…

Each row here is a 1 km² grid cell!

grid %>% head(2)

## Simple feature collection with 2 features and 21 fields
## Geometry type: POLYGON
## Dimension:     XY
## Bounding box:  xmin: -71.15658 ymin: 42.23028 xmax: -71.12166 ymax: 42.2548
## Geodetic CRS:  WGS 84
## # A tibble: 2 × 22
##   cell_id neighborhood milestone       pop_density pop_women pop_white pop_black
##   <chr>   <chr>        <chr>                 <dbl>     <dbl>     <dbl>     <dbl>
## 1 Cell 1  Hyde Park    M4: Outer Bost…       2768.     0.551     0.411     0.466
## 2 Cell 10 Hyde Park    M4: Outer Bost…       3124.     0.531     0.401     0.410
## # … with 15 more variables: pop_natam <dbl>, pop_asian <dbl>,
## #   pop_pacific <dbl>, pop_hisplat <dbl>, pop_some_college <dbl>,
## #   employees_muni <dbl>, employees_state <dbl>, employees_fed <dbl>,
## #   median_income <dbl>, income_inequality <dbl>, pop_unemployed <dbl>,
## #   median_monthly_housing_cost <dbl>, pop_age_65_plus <dbl>,
## #   pop_density_int <dbl>, geometry <POLYGON [°]>

Each row here is a zipcode! The geoid column describes the zipcode.

zip %>% head(2)

## Simple feature collection with 2 features and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -71.0856 ymin: 42.34541 xmax: -71.05719 ymax: 42.36086
## Geodetic CRS:  WGS 84
## # A tibble: 2 × 3
##   geoid area_land                                                       geometry
##   <chr>     <dbl>                                             <MULTIPOLYGON [°]>
## 1 02108    353358 (((-71.07522 42.35551, -71.07317 42.35646, -71.07231 42.3588,…
## 2 02199    148225 (((-71.0856 42.34788, -71.08375 42.34838, -71.07951 42.34951,…

Each row in fb is a pair of zipcodes, where scaled_sci shows how many Facebook friends connect those two zipcodes. For privacy, it’s been scaled from 1 to 1 billion.

fb %>% head(2)

## # A tibble: 2 × 3
##   from  to    scaled_sci
##   <chr> <chr>      <dbl>
## 1 02021 02021   15961499
## 2 02021 02026    1591400

Let’s try to visualize this data, on a map. Everything except for fb can be readily visualized on a map, using the ggplot() and geom_sf() functions. We’ll also want to crop the map using coord_sf() for readability.

ggplot() +
  # Map my zipcode boundaries
  geom_sf(data = zip) +
  # Map my points, using a nice shade of blue, with 50% transparency (alpha)
  geom_sf(data = sites, color = "steelblue", alpha = 0.5) +
  # Crop the map
  coord_sf(xlim = c(-71.14, -71.02),
           ylim = c(42.27, 42.38))

3. Statistics

We might also want to gather some statistics about our zipcodes.

For example… what is the average overall connectivity score for each zipcode? We can calculate this for each zipcode using the from variable as our zipcode id and the group_by(), summarize(), and mean() functions.

stat <- fb %>%
  group_by(from) %>%
  summarize(mean = mean(scaled_sci, na.rm = TRUE))

Note: it’s always important to add na.rm = TRUE to functions like mean, median, sd, max, etc., because it tells R to exclude cases where values are missing. Otherwise, we’ll just receive a mean of NA.

Let’s check out the first 2 rows of our result! Two suburban neighborhoods, around Dedham and Canton.

stat %>% head(2)

## # A tibble: 2 × 2
##   from     mean
##   <chr>   <dbl>
## 1 02021 887578.
## 2 02026 999484.

You might be wondering, is that a large number, compared to the rest of our sampled zipcodes? We can use summarize() to find out.

Okay, so there’s a REALLY BIG DIFFERENCE between some zipcodes’ connectivity here.

fb %>% 
  summarize(
    min = min(scaled_sci, na.rm = TRUE),
    mean = mean(scaled_sci, na.rm = TRUE),
    median = median(scaled_sci, na.rm = TRUE),
    max = max(scaled_sci, na.rm = TRUE),
    sd = sd(scaled_sci, na.rm = TRUE))

## # A tibble: 1 × 5
##     min    mean median      max       sd
##   <dbl>   <dbl>  <dbl>    <dbl>    <dbl>
## 1 69086 940896. 635246 42070495 1578778.

You could find out which zipcode pairs have the min and max connectivity scores, using filter().

fb %>%
  filter(scaled_sci == max(scaled_sci, na.rm = TRUE))

## # A tibble: 1 × 3
##   from  to    scaled_sci
##   <chr> <chr>      <dbl>
## 1 02152 02152   42070495

fb %>%
  filter(scaled_sci == min(scaled_sci, na.rm = TRUE))

## # A tibble: 2 × 3
##   from  to    scaled_sci
##   <chr> <chr>      <dbl>
## 1 02150 02492      69086
## 2 02492 02150      69086

And then you could google them to find out which neighborhoods these zipcodes pertain to.

I recommend that whenever you find something out, you just got it down as a note in your code script, so you have a written record of everything you do and thought in one place.

4. Visualizing our Statistics

Finally, we can take our data and visualize it on a map, as long as we first join together the data.frame of statistics with the geospatial data.frames we need.

Eg. if I want a map of zipcodes, shaded by mean connectivity, I need to get the mean connectivity of zipcodes in data.frame A, get the polygons in data.frame B, and join them together.

Earlier, we calculated the mean overall pairwise-connectivity for each zipcode, using the from column. We saved it in a data.frame called stat.

stat <- fb %>%
  group_by(from) %>%
  summarize(mean = mean(scaled_sci, na.rm = TRUE))

Let’s combine that with our zipcode polygons, saved in zip! zip records its zipcode ID in the geoid column; while stat records its zipcode ID in the from column.

We can join them together using left_join(), which adds in values from stat into our original zip data.frame. (Joins data from the right-side into the left-side dataset.)

zip2 <- zip %>% 
  left_join(by = c("geoid" = "from"), y = stat)

We can see zip2 now contains a column called mean!

zip2 %>%
  select(geoid, mean, geometry) %>%
  head(2)

## Simple feature collection with 2 features and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -71.0856 ymin: 42.34541 xmax: -71.05719 ymax: 42.36086
## Geodetic CRS:  WGS 84
## # A tibble: 2 × 3
##   geoid    mean                                                         geometry
##   <chr>   <dbl>                                               <MULTIPOLYGON [°]>
## 1 02108 750762. (((-71.07522 42.35551, -71.07317 42.35646, -71.07231 42.3588, -…
## 2 02199     NA  (((-71.0856 42.34788, -71.08375 42.34838, -71.07951 42.34951, -…

Finally, we can visualize it like this, telling ggplot to ‘map’ the data from the mean column in zip2 to be the fill (shading) for each polygon.

ggplot() +
  # My zipcodes
  geom_sf(data = zip2, mapping = aes(fill = mean)) +
  # Points  
  geom_sf(data = sites) +
  # Crop the map
  coord_sf(xlim = c(-71.14, -71.02),
           ylim = c(42.27, 42.38))

5. Rescaling

Next, we probably want to make a solution for those enormous social connectivity index numbers. In this case, we’re going to rescale them. First, we will log-transform them, which lessens the distance between increasingly large observations. And then, we will rescale that data so that the minimum logged observation becomes 0 and the max logged value becomes 1. This gives us a much nicer, much more natural measure of connectivity.

library(scales) # for rescaling
fb <- read_csv("raw_data/boston_fb_sci.csv") %>% 
  mutate(scaled_sci = rescale(log(scaled_sci), to = c(0, 1)))

Let’s see what it looks like now with summary() (different from summarize()).

fb$scaled_sci %>%  summary()

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.2888  0.3460  0.3564  0.4030  1.0000

Much easier to interpret!

6. Data Wrangling

Let’s import our dataset of all known social infrastructure points, both API generated and not. We’ll call it allsites.

library(tidyverse)
library(sf)

# We're going to import  
allsites <- read_csv("raw_data/boston_social_infra_2022_03_03.csv") %>%
  st_as_sf(coords = c("x", "y"), crs = 4326)

# And let's reimport the grid
grid <- read_sf("raw_data/grid_covariates_tracts.kml") %>%
  select(-c(Name:icon))


ggplot() +
  geom_sf(data = grid)  +
  geom_sf(data = allsites)

Let’s make one single dataset, where each row is a zipcode, containing rates of social infrastructure and rates of connectivity.

First, we’ll get the rates of connectivity within zipcodes.

mywithin <- fb %>%
  filter(from == to) %>%
  # For each zipcode, please get the total ties to residents within the same zipcode
  group_by(zip = from) %>%
  summarize(within = sum(scaled_sci, na.rm = TRUE)) %>%
  ungroup() %>%
  # Then rescale that from 0 to 1
  mutate(within = within %>% rescale(to = c(0, 1)))

mywithin %>% head(3)

## # A tibble: 3 × 2
##   zip   within
##   <chr>  <dbl>
## 1 02021 0.686 
## 2 02026 0.612 
## 3 02108 0.0623

Second, we’ll get the rates of connectivity between zipcodes.

mybetween <- fb %>%
  filter(from != to) %>%
  # For each zipcode, please get the total ties to other zipcodes
  group_by(zip = from) %>%
  summarize(between = sum(scaled_sci, na.rm = TRUE)) %>%
  ungroup() %>%
  # Then rescale that from 0 to 1
  mutate(between = between %>% rescale(to = c(0, 1)))

mybetween %>% head(3)

## # A tibble: 3 × 2
##   zip   between
##   <chr>   <dbl>
## 1 02021   0.296
## 2 02026   0.534
## 3 02108   0.622

Next, we’ll tally up social infrastructure sites.

allsites %>% 
  as_tibble() %>% 
  group_by(cell_id, type) %>%
  summarize(count = n()) %>%
  head()

## # A tibble: 6 × 3
## # Groups:   cell_id [2]
##   cell_id  type              count
##   <chr>    <chr>             <int>
## 1 Cell 10  Parks                 4
## 2 Cell 10  Places of Worship     2
## 3 Cell 10  Social Businesses     1
## 4 Cell 100 Community Spaces      1
## 5 Cell 100 Parks                 1
## 6 Cell 100 Social Businesses     1

There’s a faster, albeit a little less clean way to do this. Let’s count up social infrastructure by type, join in cell population density, and get the rate of sites per 1000 residents.

myrates <- allsites %>%
  as_tibble() %>%
  group_by(cell_id) %>%
  summarize(
    total = sum(!is.na(type), na.rm = TRUE),
    community_spaces = sum(type == "Community Spaces", na.rm = TRUE),
    parks = sum(type == "Parks", na.rm = TRUE),
    social = sum(type == "Social Businesses", na.rm = TRUE),
    places_of_worship = sum(type == "Places of Worship", na.rm = TRUE)) %>%
  # Join in population density
  left_join(by = "cell_id", 
            y = grid %>% as_tibble() %>% 
              select(cell_id, pop_density_int)) %>%
  # Calculate rate of social infrastructure per 1000 residents per square kilometer
  mutate(
    total = total / pop_density_int * 1000,
    community_spaces = community_spaces / pop_density_int * 1000,
    parks = parks / pop_density_int * 1000,
    social = social / pop_density_int * 1000,
    places_of_worship = places_of_worship / pop_density_int * 1000) %>%
  # Drop excess variables
  select(-pop_density_int) %>%
  # Join rates into grid
  right_join(by = "cell_id", y = grid) %>%
  # Then fill in missing values with zero, because it means there were no sites there, in this case
  mutate_at(vars(total, community_spaces, parks,social, places_of_worship),
                list(~if_else(is.na(.), 0, .))) %>%
  # Turn back into an sf object
  st_as_sf(crs = 4326) 

# Here are the variables in this one.
myrates %>% names()

##  [1] "cell_id"                     "total"                      
##  [3] "community_spaces"            "parks"                      
##  [5] "social"                      "places_of_worship"          
##  [7] "neighborhood"                "milestone"                  
##  [9] "pop_density"                 "pop_women"                  
## [11] "pop_white"                   "pop_black"                  
## [13] "pop_natam"                   "pop_asian"                  
## [15] "pop_pacific"                 "pop_hisplat"                
## [17] "pop_some_college"            "employees_muni"             
## [19] "employees_state"             "employees_fed"              
## [21] "median_income"               "income_inequality"          
## [23] "pop_unemployed"              "median_monthly_housing_cost"
## [25] "pop_age_65_plus"             "pop_density_int"            
## [27] "geometry"

mydat <- myrates %>%
  # Join in zipcode ID
  st_join(zip, left = TRUE) %>%
  # For each zipcode,
  group_by(geoid) %>%
  # Calculate the median level of concept X in a city block in that zipcode.
  summarize_at(vars(pop_density, total, community_spaces, parks,social, places_of_worship,
                     pop_age_65_plus, pop_women:pop_some_college, 
                    median_income, income_inequality, pop_unemployed, median_monthly_housing_cost),
               list(~median(., na.rm = TRUE))) %>%
  # Then join in our zipcode FB measures
  left_join(by = c("geoid" = "zip"), y = mywithin) %>%
  left_join(by = c("geoid" = "zip"), y = mybetween)

Finally, let’s trim this dataset, by zooming into just the zipcodes located within the core Boston zone. Let’s examine.

# Load in zipcode polygons
zip <- read_sf("raw_data/zipcodes.kml") %>%
  select(-c(Name:icon))# get rid of fluff

ggplot() +
  geom_sf(data = zip %>%
            st_join(myrates, left = FALSE), color = "blue", fill = "grey") +
  geom_sf(data = myrates, fill = NA)

Let’s save this file as an .rds file, in R formatting, so we can easily access it without breaking up the geometry.

mydat %>%
  write_rds("raw_data/dataset.rds")

And let’s get rid of the rest of the data here.

rm(list = ls())

7. Correlation

Next, we’re going to investigate the correlations between social infrastructure and social connectivity.

Let’s import our data.

mydat <- read_rds("raw_data/dataset.rds") %>%
  # Because our variables are strongly right skewed, we need to log-transform them
  # However, a few cells report zero social infrastructure sites.
  # We're going to impute them with half the value of the smallest nonzero value,
  # and then log-transform, to maintain the general shape of the distribution
  mutate_at(vars(total, social, parks, places_of_worship),
            list(~if_else(. == 0, true = sort(unique(.))[2] / 2, false =  .))) %>%
  mutate_at(vars(total, social, parks, places_of_worship),
            list(~log(.)))

Each time we calculate a statistic, we need to convert to tibble first, and then run our calculation, because otherwise the mapping-formatting messes it up.

bind_rows(
  mydat %>%
    as_tibble() %>%
    summarize_at(vars(total:median_monthly_housing_cost), 
                 list(~cor(x = ., within, use = "pairwise.complete.obs"))) %>%
    mutate(ties = "Within Zipcode"),
  
  mydat %>%
    as_tibble() %>%
    summarize_at(vars(total:median_monthly_housing_cost), 
                 list(~cor(x = ., between, use = "pairwise.complete.obs"))) %>%
    mutate(ties = "Different Zipcodes")
) %>%
  pivot_longer(cols = c(total:median_monthly_housing_cost),
               names_to = "variable", values_to = "statistic") %>%
  write_csv("raw_data/correlations.csv")

Wow! That was fast!

8. Visualizing Correlations

Let’s find some clever ways to intuitively visualize this data.

mycor <- read_csv("raw_data/correlations.csv")

Let’s check out our values!

mycor

ties	variable	statistic
Within Zipcode	total	-0.545
Within Zipcode	community_spaces	-0.238
Within Zipcode	parks	-0.476
Within Zipcode	social	-0.603
Within Zipcode	places_of_worship	-0.273
Within Zipcode	pop_age_65_plus	0.012
Within Zipcode	pop_women	-0.044
Within Zipcode	pop_white	-0.153
Within Zipcode	pop_black	0.181
Within Zipcode	pop_natam	-0.085
Within Zipcode	pop_asian	-0.194
Within Zipcode	pop_pacific	-0.231
Within Zipcode	pop_hisplat	0.182
Within Zipcode	pop_some_college	0.538
Within Zipcode	median_income	-0.169
Within Zipcode	income_inequality	-0.472
Within Zipcode	pop_unemployed	0.075
Within Zipcode	median_monthly_housing_cost	-0.427
Different Zipcodes	total	0.708
Different Zipcodes	community_spaces	0.399
Different Zipcodes	parks	0.681
Different Zipcodes	social	0.662
Different Zipcodes	places_of_worship	0.644
Different Zipcodes	pop_age_65_plus	-0.041
Different Zipcodes	pop_women	-0.053
Different Zipcodes	pop_white	-0.408
Different Zipcodes	pop_black	0.491
Different Zipcodes	pop_natam	0.470
Different Zipcodes	pop_asian	-0.190
Different Zipcodes	pop_pacific	0.078
Different Zipcodes	pop_hisplat	0.014
Different Zipcodes	pop_some_college	0.039
Different Zipcodes	median_income	-0.057
Different Zipcodes	income_inequality	0.536
Different Zipcodes	pop_unemployed	0.342
Different Zipcodes	median_monthly_housing_cost	0.004

First, let’s look at social infrastructure.

mycor %>%
  filter(variable %in% c("total", "parks", "community_spaces", 
                         "places_of_worship", "social")) %>%
  ggplot(mapping = aes(x = ties, y = variable, 
                       fill = statistic, label = round(statistic, 2))) +
  geom_tile() +
  geom_text(color = "white", fontface = "bold", size = 5) +
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)

Now, using Khushi’s excellent visualization design, let’s convert this into a barchart.

mycor %>%
  filter(variable %in% c("total", "social", "community_spaces",
                         "places_of_worship", "parks")) %>%
  mutate(variable = variable %>% factor() %>% recode_factor(
    "total" = "Total",
    "community_spaces" = "Community\nSpaces",
    "places_of_worship" = "Places\nof Worship",
    "social" = "Social\nBusineses",
    "parks" = "Parks"),
    textposition = if_else(ties == "Different Zipcodes",
                                      true = statistic + 0.1, false = statistic - 0.1)) %>%
  ggplot(mapping = aes(x = variable, y = statistic, fill = ties, label = round(statistic, 2))) +
  geom_col() +
  geom_text(mapping = aes(y = textposition)) +
  scale_fill_manual(values = c("#648FFF", "#DC267F"), name = NULL) +
  ylim(c(-1, 1)) +
  theme_minimal(base_size = 14) +
  theme(legend.position = c(0.20, 0.99),
        panel.grid = element_blank(),
        axis.line.y.left = element_line(color = "black", size= 0.5)) +
  labs(x = "Type of Social Infrastructure", 
       y = "Correlation (-1 to 1)\n(between Ties and Social Infrastructure)")

Next, let’s look at demographics.

mycor %>% 
  filter(!variable %in% c("total", "parks", "community_spaces", 
                          "places_of_worship", "social")) %>%
  ggplot(mapping = aes(x = ties, y = variable, 
                       fill = statistic, label = round(statistic, 2))) +
  geom_tile() +
  geom_text(color = "black") +
  scale_fill_gradient2(low = "red", mid = "white", high = "blue", midpoint = 0)

Using Khushi’s visualization code, let’s convert this into a bar chart too!

mycor %>%
  filter(!variable %in% c("total", "parks", "community_spaces", 
                          "places_of_worship", "social")) %>%
  mutate(variable = variable %>% factor() %>%
           recode_factor(
             "pop_women" = "% Women",
             "pop_age_65_plus" = "% Over Age 65",
             "pop_black" = "% Black",
             "pop_white" = "% White",
             "pop_hisplat" = "% Hispanic/Latino",
             "pop_asian" = "% Asian",
             "pop_natam" = "% Native American",
             "pop_pacific" = "% Pacific Islander",
             "pop_some_college" = "% Some College",
             "pop_unemployed" = "% Unemployed",
             "median_income" = "Median Income",
             "median_monthly_housing_cost" =  "Median Monthly\nHousing Cost", 
             "income_inequality" = "Gini Coefficient") %>%
           factor(levels = levels(.) %>% rev()),
         statistic = round(statistic, 2)) %>%
  ggplot(mapping = aes(x = variable,
                       y = statistic,
                       color = ties,
                       fill = ties,
                       label = statistic)) +
  geom_point(size = 3, shape = 21, color = "white") +
  geom_segment(
    mapping = aes(x = variable, xend = variable, 
                  y = 0, yend = statistic),
    position = "dodge") +
  geom_text(data = . %>%
              filter(ties == "Different Zipcodes"),
            mapping = aes(label = statistic),
            nudge_x = 0.5) +
  geom_text(data = . %>%
              filter(ties == "Within Zipcode"),
            mapping = aes(label = statistic),
            nudge_x = -0.5) +
  geom_hline(yintercept = 0) +
  scale_fill_manual(values = c("#648FFF", "#DC267F") %>% rev(), name = NULL) +
  scale_color_manual(values = c("#648FFF", "#DC267F") %>% rev(), name = NULL) +
  scale_x_discrete(expand = expansion(add = c(1, 1))) +
  coord_flip() +
  theme_bw(base_size = 14) + 
  theme(legend.position = "bottom",
        panel.grid.minor.y = element_blank(),
        panel.grid.major.y = element_blank()) +
  labs(x = NULL,
       y = "Correlation (-1 to 1)\n(between Social Infrastructure and Ties)",
       fill = NULL, color = NULL)

# 9. Scatterplot

mydat <- read_rds("raw_data/dataset.rds") %>%
  # Because our variables are strongly right skewed, we need to log-transform them
  # However, a few cells report zero social infrastructure sites.
  # We're going to impute them with half the value of the smallest nonzero value,
  # and then log-transform, to maintain the general shape of the distribution
  mutate_at(vars(total, social, parks, places_of_worship),
            list(~if_else(. == 0, true = sort(unique(.))[2] / 2, false =  .)))

Let’s gather some very simple models too.

mw <- mydat %>%
    lm(formula = within ~ log(total))

mb <- mydat %>%
    lm(formula = between ~ log(total))
    
library(broom)

# Now, let's gather some basic data about our simple models,
# include the R-squard, fstatistic, p-value, and sigma
mystat <- bind_rows(
  tidy(mw) %>%
    filter(term == "log(total)") %>%
    bind_cols(
      mw %>% glance() %>% 
        select(r = r.squared, fstat = statistic, fstat_p = p.value, sigma)) %>%
    mutate(variable = "Within Zipcode"),
  
  tidy(mb) %>%
    filter(term == "log(total)") %>%
    bind_cols(
      mb %>% glance() %>% 
        select(r = r.squared, fstat = statistic, fstat_p = p.value, sigma)) %>%
    mutate(variable = "Different Zipcodes")
) %>%
  mutate(estimate = round(estimate, 2),
         statistic = round(statistic, 2),
         plab = round(p.value, 3),
         plab = if_else(plab == "0", "p < 0.001", paste("p = ", plab, sep = "")),
         r = round(r, 2),
         sigma = round(sigma, 2),
         fstat = round(fstat, 2),
         
         label = paste("Beta = ", estimate, gtools::stars.pval(p.value), 
                       "\n", "Variation Explained: ", r*100,"%", "\n", 
                       "Avg. Error: ", sigma, sep = ""))

mydat %>%
  pivot_longer(cols = c(within, between), 
               names_to = "variable", values_to = "value") %>%
  mutate(variable = variable %>% recode_factor(
    "between" = "Different Zipcodes",
    "within" = "Within Zipcode")) %>%
  ggplot(mapping = aes(x = total, y = value, fill = variable, color = variable, group = variable)) +
  geom_point(shape = 21, size = 5, color = 'white') +
  geom_smooth(method = "lm", formula = y ~ log(x), color = "white") +
  shadowtext::geom_shadowtext(data = mystat, mapping = aes(group = variable, x = 0.5, y = c(0.75, 0.25), label = label),
                              bg.r = 0.2, bg.color = "white", color = "#373737", hjust = 0, fontface = "bold") +
  facet_wrap(~variable) +
  theme_classic(base_size = 14) +
  scale_fill_manual(values = c("#648FFF", "#DC267F")) +
  guides(fill = "none", color = "none") +
  theme(plot.caption = ggtext::element_markdown(size = 10, hjust = 0),
        panel.spacing.x = unit(0.5, "cm"),
        strip.background = element_blank(),
        panel.border = element_rect(color = "#373737", fill = NA),
        axis.line = element_blank()) +
  scale_x_continuous(expand = expansion(add = c(0.25, -0.5))) +
  labs(caption = "<b>Note</b>: Based on simple bivariate OLS regression model.
       <br><b>Model Equation</b>: Social Connectivity ~ log(Social Infrastructure Rate)
       <br>*<b>Rate</b>: Zipcode-averaged rate of social infrastructure, per 1000 residents, per 1 km<sup>2</sup>.",
       x = "Rate of Total Social Infrastructure*",
       y = "Social Connectivity (0 to 1)")

## Warning: Removed 6 rows containing non-finite values (stat_smooth).

## Warning: Removed 6 rows containing missing values (geom_point).

10. Data Revisited

Also, for context, what did the raw dataset look like? Let’s check it out.

# Import raw data
mydat <- read_rds("raw_data/dataset.rds") %>%
    # Convert to data.frame/tibble for easy reading
  as_tibble() %>%
  select(geoid, 
         # Add social connectivity
         within, between, 
         # Add social infrastructure
         total:places_of_worship, 
         # Add Demographics
         pop_density, pop_age_65_plus:median_monthly_housing_cost)

mydat

geoid	within	between	total	community_spaces	parks	social	places_of_worship	pop_density	pop_age_65_plus	pop_women	pop_white	pop_black	pop_natam	pop_asian	pop_pacific	pop_hisplat	pop_some_college	median_income	income_inequality	pop_unemployed	median_monthly_housing_cost
02021	0.686	0.296	0.000	0.000	0.000	0.000	0.000	1855.807	0.127	0.490	0.456	0.292	0.000	0.072	0.000	0.214	0.257	81063.00	0.370	0.036	1485.000
02026	0.612	0.534	0.406	0.000	0.222	0.000	0.000	2633.603	0.165	0.538	0.505	0.292	0.000	0.078	0.000	0.169	0.182	76007.50	0.424	0.036	1473.000
02108	0.062	0.622	4.370	0.507	1.820	0.611	0.498	12346.511	0.148	0.519	0.790	0.030	0.001	0.126	0.000	0.074	0.058	108540.36	0.554	0.043	2348.825
02109	0.247	0.886	3.089	0.119	1.590	0.446	0.345	13100.245	0.117	0.461	0.788	0.073	0.001	0.084	0.001	0.243	0.083	105463.36	0.517	0.044	2224.462
02110	0.019	0.641	3.798	0.197	1.731	0.617	0.289	10060.986	0.117	0.465	0.768	0.049	0.001	0.116	0.001	0.138	0.071	105463.36	0.520	0.046	2224.462
02111	0.117	0.810	3.494	0.299	1.897	0.499	0.399	10060.986	0.140	0.497	0.739	0.030	0.000	0.167	0.000	0.082	0.072	108900.12	0.521	0.053	2159.500
02113	NA	NA	5.258	0.837	2.498	1.159	0.765	13100.245	0.130	0.486	0.821	0.049	0.001	0.084	0.001	0.138	0.083	105463.36	0.517	0.037	2224.462
02114	0.075	0.779	3.479	0.239	1.739	0.458	0.574	12346.511	0.142	0.510	0.822	0.045	0.000	0.091	0.000	0.083	0.072	108900.12	0.512	0.034	2355.200
02115	0.079	0.644	2.030	0.164	0.916	0.530	0.205	13733.234	0.133	0.539	0.610	0.150	0.002	0.151	0.001	0.115	0.062	53537.43	0.546	0.098	1776.500
02116	0.049	0.783	2.693	0.252	1.094	0.499	0.395	12507.108	0.144	0.528	0.711	0.053	0.002	0.148	0.000	0.092	0.061	97113.46	0.577	0.062	2100.727
02118	0.233	0.938	2.672	0.231	1.143	0.382	0.368	10973.122	0.109	0.514	0.542	0.200	0.003	0.110	0.000	0.209	0.097	62586.51	0.547	0.093	1467.006
02119	0.409	0.965	3.330	0.392	1.196	0.399	0.712	8357.402	0.113	0.523	0.278	0.476	0.006	0.041	0.000	0.286	0.141	40614.00	0.523	0.113	1231.333
02120	0.401	0.857	2.693	0.369	1.308	0.449	0.561	9356.877	0.116	0.535	0.538	0.208	0.003	0.122	0.000	0.186	0.087	48611.25	0.506	0.099	1408.500
02121	0.455	0.970	2.426	0.194	1.077	0.301	0.989	8451.741	0.111	0.502	0.188	0.586	0.007	0.022	0.000	0.291	0.166	40348.27	0.507	0.140	1288.000
02122	0.340	0.852	1.103	0.123	0.473	0.247	0.239	6086.799	0.117	0.517	0.561	0.222	0.001	0.088	0.000	0.124	0.144	80114.60	0.455	0.063	1685.000
02124	0.394	1.000	1.534	0.136	0.348	0.244	0.362	6159.891	0.123	0.520	0.302	0.499	0.003	0.071	0.000	0.140	0.168	64466.46	0.460	0.082	1672.500
02125	0.252	0.878	1.418	0.150	0.600	0.247	0.279	8084.176	0.104	0.499	0.404	0.259	0.002	0.109	0.000	0.232	0.116	62040.50	0.506	0.079	1493.800
02126	0.442	0.950	0.850	0.000	0.152	0.153	0.425	5377.362	0.139	0.521	0.171	0.691	0.003	0.012	0.000	0.181	0.198	61934.20	0.467	0.102	1670.667
02127	0.452	0.924	0.750	0.124	0.567	0.114	0.127	8031.003	0.077	0.431	0.795	0.073	0.000	0.053	0.000	0.192	0.090	114139.40	0.490	0.029	2008.000
02128	0.340	0.542	0.591	0.000	0.228	0.000	0.000	3899.365	0.114	0.465	0.694	0.039	0.001	0.046	0.002	0.415	0.120	70281.20	0.465	0.052	1647.667
02129	0.451	0.822	0.523	0.000	0.337	0.061	0.000	4137.486	0.111	0.481	0.830	0.061	0.000	0.069	0.000	0.096	0.092	116227.33	0.489	0.022	2064.400
02130	0.145	0.780	1.382	0.298	0.933	0.127	0.428	4679.716	0.149	0.542	0.645	0.172	0.003	0.045	0.000	0.197	0.115	88313.98	0.448	0.090	1830.792
02131	0.296	0.804	0.948	0.000	0.239	0.214	0.372	4475.363	0.127	0.518	0.598	0.256	0.003	0.023	0.000	0.246	0.182	79330.00	0.436	0.077	1688.500
02132	0.428	0.695	1.048	0.000	0.349	0.237	0.237	2492.811	0.193	0.531	0.800	0.041	0.000	0.082	0.000	0.083	0.138	99105.00	0.404	0.041	1688.500
02134	0.128	0.561	0.811	0.000	0.061	0.227	0.069	4988.269	0.049	0.513	0.600	0.070	0.002	0.220	0.000	0.132	0.076	68228.90	0.475	0.056	1920.500
02135	0.090	0.651	1.048	0.055	0.283	0.288	0.000	6285.313	0.090	0.513	0.698	0.059	0.001	0.152	0.000	0.119	0.093	78681.36	0.445	0.035	1846.357
02136	0.329	0.880	0.840	0.000	0.234	0.217	0.293	3478.778	0.131	0.520	0.411	0.437	0.000	0.034	0.000	0.246	0.206	76628.75	0.437	0.076	1594.500
02138	0.502	0.329	0.134	0.000	0.134	0.000	0.000	2651.759	0.062	0.505	0.601	0.077	0.001	0.213	0.000	0.122	0.080	74388.25	0.475	0.065	1987.167
02139	0.345	0.596	0.296	0.069	0.090	0.069	0.069	4988.269	0.038	0.537	0.623	0.063	0.002	0.220	0.000	0.121	0.041	61839.33	0.548	0.096	2092.667
02141	0.149	0.625	0.848	0.000	0.636	0.141	0.000	4274.837	0.114	0.487	0.862	0.048	0.000	0.052	0.000	0.061	0.083	131138.50	0.431	0.016	2461.100
02142	0.000	0.752	1.695	0.000	1.086	0.197	0.000	12185.914	0.142	0.510	0.838	0.045	0.000	0.075	0.000	0.083	0.062	119714.00	0.507	0.034	2415.250
02143	0.200	0.516	0.280	0.000	0.093	0.186	0.000	3482.079	0.088	0.476	0.869	0.053	0.000	0.041	0.000	0.045	0.083	136980.75	0.405	0.014	2347.750
02145	0.301	0.539	0.487	0.000	0.186	0.000	0.000	2363.138	0.086	0.465	0.885	0.052	0.000	0.029	0.000	0.039	0.074	142563.00	0.378	0.013	2567.000
02148	0.176	0.353	0.000	0.000	0.000	0.000	0.000	2374.588	0.129	0.511	0.807	0.083	0.000	0.056	0.000	0.228	0.116	72721.00	0.434	0.030	1689.000
02149	0.337	0.338	0.000	0.000	0.000	0.000	0.000	3673.953	0.126	0.481	0.693	0.079	0.000	0.039	0.000	0.370	0.136	61254.50	0.427	0.049	1581.000
02150	0.509	0.339	0.000	0.000	0.000	0.000	0.000	7115.202	0.115	0.479	0.590	0.065	0.001	0.041	0.000	0.519	0.134	58841.40	0.443	0.056	1496.750
02151	0.346	0.380	0.000	0.000	0.000	0.000	0.000	3372.973	0.144	0.505	0.788	0.055	0.000	0.052	0.000	0.279	0.143	61658.00	0.462	0.053	1499.375
02152	1.000	0.420	0.000	0.000	0.000	0.000	0.000	3820.653	0.139	0.513	0.892	0.023	0.001	0.009	0.000	0.137	0.204	76974.00	0.434	0.050	1571.333
02163	0.955	0.147	0.977	0.293	0.391	0.098	0.195	3286.801	0.049	0.513	0.604	0.074	0.002	0.218	0.000	0.128	0.059	70879.08	0.483	0.087	2092.667
02171	0.432	0.376	0.000	0.000	0.000	0.000	0.000	3270.169	0.077	0.448	0.770	0.174	0.000	0.070	0.000	0.292	0.115	90893.00	0.419	0.040	1731.333
02176	0.613	0.082	0.000	0.000	0.000	0.000	0.000	2374.588	0.129	0.511	0.807	0.083	0.000	0.056	0.000	0.228	0.116	72721.00	0.434	0.030	1689.000
02186	0.479	0.638	0.205	0.000	0.000	0.000	0.000	4239.814	0.156	0.508	0.428	0.387	0.000	0.057	0.000	0.152	0.210	79156.50	0.456	0.068	1644.667
02199	NA	NA	2.189	0.203	0.777	0.926	0.245	16129.277	0.139	0.535	0.698	0.092	0.002	0.141	0.002	0.115	0.065	80416.03	0.571	0.080	2053.682
02203	NA	NA	5.261	1.435	2.487	0.765	0.574	13100.245	0.140	0.497	0.822	0.030	0.000	0.105	0.001	0.082	0.072	108900.12	0.521	0.046	2448.625
02210	0.150	0.840	0.844	0.000	0.381	0.127	0.000	8138.914	0.069	0.393	0.795	0.073	0.000	0.053	0.000	0.108	0.085	127977.50	0.482	0.021	2311.800
02215	0.075	0.435	1.527	0.121	0.220	0.356	0.158	11140.257	0.125	0.551	0.623	0.080	0.002	0.173	0.000	0.114	0.057	46126.88	0.546	0.098	1924.625
02445	0.192	0.468	0.938	0.000	0.272	0.062	0.176	4333.078	0.241	0.572	0.792	0.047	0.002	0.076	0.000	0.105	0.065	98831.64	0.454	0.037	1776.500
02446	0.123	0.469	1.207	0.107	0.206	0.296	0.342	10636.854	0.076	0.544	0.623	0.062	0.003	0.202	0.000	0.118	0.064	53323.12	0.472	0.079	1776.500
02458	0.140	0.602	0.238	0.078	0.000	0.160	0.000	5845.176	0.093	0.510	0.739	0.057	0.000	0.121	0.002	0.106	0.088	95480.67	0.411	0.028	1983.167
02459	0.600	0.191	0.313	0.000	0.000	0.000	0.000	1566.452	0.165	0.503	0.800	0.041	0.000	0.084	0.000	0.057	0.129	112468.00	0.339	0.046	1773.000
02467	0.359	0.357	0.892	0.000	0.289	0.163	0.000	2470.987	0.165	0.542	0.810	0.033	0.000	0.085	0.000	0.102	0.105	106617.00	0.441	0.035	1735.000
02472	0.391	0.372	0.268	0.000	0.268	0.000	0.000	3377.791	0.088	0.497	0.601	0.072	0.000	0.207	0.000	0.122	0.096	90980.00	0.421	0.050	1896.000
02492	0.803	0.000	0.000	0.000	0.000	0.000	0.000	1566.452	0.165	0.503	0.800	0.041	0.000	0.084	0.000	0.057	0.129	112468.00	0.339	0.046	1773.000

To replicate Khushi’s excellent analysis with our new rescaled measures of connectivity, let’s write a code chunk that will output the results online, so we can refer to them when we need them.

stat_within <- mydat %>%
  summarize(
    # Get mean
    mean = mean(within, na.rm = TRUE),
    # Get standard deviation
    sd = sd(within, na.rm = TRUE),
    # Get number of valid responses
    nobs = sum(!is.na(within)),
    # Get smallest connectivity score (always 0)
    min = min(within, na.rm = TRUE),
    # Get 25th percentile
    p25 = quantile(within, probs = 0.25, na.rm = TRUE),
    # Get 50th percentile (median)
    p50 = quantile(within, probs = 0.50, na.rm = TRUE),
    # Get 75th percentile
    p75 = quantile(within, probs = 0.75, na.rm = TRUE),
    # Get max (always 1)
    max = max(within, na.rm = TRUE))

Let’s repeat for between-zipcode ties!

stat_between <- mydat %>%
  summarize(
    # Get mean
    mean = mean(between, na.rm = TRUE),
    # Get standard deviation
    sd = sd(between, na.rm = TRUE),
    # Get number of valid responses
    nobs = sum(!is.na(between)),
    # Get smallest connectivity score (always 0)
    min = min(between, na.rm = TRUE),
    # Get 25th percentile
    p25 = quantile(between, probs = 0.25, na.rm = TRUE),
    # Get 50th percentile (median)
    p50 = quantile(between, probs = 0.50, na.rm = TRUE),
    # Get 75th percentile
    p75 = quantile(between, probs = 0.75, na.rm = TRUE),
    # Get max (always 1)
    max = max(between, na.rm = TRUE))

Finally, let’s combine and reformat them for easy viewing…

mystat <- bind_rows(
  stat_within %>% mutate(ties = "within"), 
  stat_between %>% mutate(ties = "between")) %>%
  # Pivot longer, so that statistics get put all in the same column
  pivot_longer(cols = -c(ties), names_to = "variable", values_to = "value") %>%
  # Split up statistic column by type of ties
  pivot_wider(id_cols = variable, names_from = ties, values_from = value)

Let’s view them!

mystat

variable	within	between
mean	0.334	0.605
sd	0.229	0.254
nobs	50.000	50.000
min	0.000	0.000
p25	0.146	0.390
p50	0.338	0.623
p75	0.449	0.819
max	1.000	1.000

Finally, let’s find the zipcodes within our study region (core Boston) with the min, median, and max level of connectivity for each type of tie.

myorder <- bind_rows(
  
  mydat %>%
    select(geoid, value = between) %>% 
    filter(!is.na(value)) %>%
    arrange(value) %>%
    slice(26) %>%
    mutate(ties = "between", type = "median"),
  
  mydat %>%
    select(geoid, value = within) %>% 
    filter(!is.na(value)) %>%
    arrange(value) %>%
    slice(26) %>%
    mutate(ties = "within", type = "median"),
  
  mydat %>%
    select(geoid, value = between) %>% 
    filter(!is.na(value)) %>%
    arrange(value) %>%
    slice(1) %>%
    mutate(ties = "between", type = "min"),
  
  mydat %>%
    select(geoid, value = within) %>% 
    filter(!is.na(value)) %>%
    arrange(value) %>%
    slice(1) %>%
    mutate(ties = "within", type = "min"),
  
  
  
  mydat %>%
    select(geoid, value = between) %>% 
    filter(!is.na(value)) %>%
    arrange(value) %>%
    slice(50) %>%
    mutate(ties = "between", type = "max"),
  
  mydat %>%
    select(geoid, value = within) %>% 
    filter(!is.na(value)) %>%
    arrange(value) %>%
    slice(50) %>%
    mutate(ties = "within", type = "max")
  
) %>%
  arrange(ties, type) %>%
  select(ties, type, geoid, value)

Let’s view them in a nice table!

myorder

ties	type	geoid	value
between	max	02124	1.0000000
between	median	02141	0.6248081
between	min	02492	0.0000000
within	max	02152	1.0000000
within	median	02122	0.3397228
within	min	02142	0.0000000

Finally, let’s make a few more visuals for the paper.

11. Map 1

library(tidyverse)
library(sf)
library(tigris)

mydat <- read_rds("raw_data/dataset.rds") %>%
  as_tibble() %>% select(-geometry) %>%
  # Because our variables are strongly right skewed, we need to log-transform them
  # However, a few cells report zero social infrastructure sites.
  # We're going to impute them with half the value of the smallest nonzero value,
  # and then log-transform, to maintain the general shape of the distribution
  mutate_at(vars(total, social, parks, places_of_worship),
            list(~if_else(. == 0, true = sort(unique(.))[2] / 2, false =  .))) 

csub <- tigris::county_subdivisions(year = 2020, county = "Suffolk", state = "MA", cb = TRUE) %>%
  st_as_sf() %>%
  st_transform(crs = 4326) %>%
  filter(NAME == "Boston")

zip <- read_sf("raw_data/zipcodes.kml") %>% select(geoid, geometry) %>% st_as_sf()

# Get background
bg <- zip %>% st_union() %>% st_make_valid() %>% nngeo::st_remove_holes()

zipdata <- zip %>% left_join(by= "geoid", y = mydat)

fb <- read_csv("raw_data/boston_fb_sci.csv") %>% 
  mutate(scaled_sci = rescale(log(scaled_sci), to = c(0, 1)))  %>%
  left_join(by = c("from" = "geoid"), 
            y = points %>% as_tibble() %>% select(geoid, from_x = x, from_y = y)) %>%
  left_join(by = c("to" = "geoid"), 
            y = points %>% as_tibble() %>% select(geoid, to_x = x, to_y = y)) %>%
  filter(from_x != to_x, from_y != to_y)

g1 <- ggplot() +
  geom_sf(data = bg, color = "lightgrey", fill = "black", size = 2)  +
  geom_sf(data = zipdata, mapping = aes(fill = total), color = "white", size = 0.1) +
  geom_sf(data = csub, color = "black", size = 1.5, fill = NA) +
  labs(fill = "Rate of Total<br>Social<br>Infrastructure") +
  theme_void(base_size = 14) +
  theme(plot.caption = ggtext::element_markdown(size = 10, hjust = 0),
        legend.title = ggtext::element_markdown(size = 12, hjust = 0)) +
  scale_fill_gradient(low = "black", high = "white", trans = "log",
                      breaks = c(0.07, .36, 1, 5.26),
                        labels =c("0.07", ".36", "1","5.26")) +
  theme(legend.text = element_text(hjust = 0))

g2 <- ggplot() +
  geom_sf(data = bg, color = "lightgrey", fill = "black", size = 2)  +
  geom_sf(data = zipdata, mapping = aes(fill = between), color = "white", size = 0.1) +
  geom_sf(data = csub, color = "black", size = 1.5, fill = NA) +
  labs(fill = "Connectivity<br>Between<br>Zipcodes") +
  theme_void(base_size = 14) +
  theme(plot.caption = ggtext::element_markdown(size = 10, hjust = 0),
        legend.title = ggtext::element_markdown(size = 12, hjust = 0)) +
  scale_fill_gradient(low = "black", high = "#DC267F", na.value = "black",
                      breaks = c(0, 0.33, 0.66, 1), labels = c("0", ".33", ".66", "1")) +
  theme(legend.text = element_text(hjust = 0))

g3 <- ggplot() +
  geom_sf(data = bg, color = "lightgrey", fill = "black", size = 2)  +
  geom_sf(data = zipdata, mapping = aes(fill = within), color = "white", size = 0.1) +
  geom_sf(data = csub, color = "black", size = 1.5, fill = NA) +
  labs(fill = "Connectivity<br>Within<br>Same Zipcode") +
  theme_void(base_size = 14) +
  theme(plot.caption = ggtext::element_markdown(size = 10, hjust = 0),
        legend.title = ggtext::element_markdown(size = 12, hjust = 0)) +
  scale_fill_gradient(low = "black", high = "#648FFF", na.value = "black", 
                      breaks = c(0, 0.33, 0.66, 1), labels = c("0", ".33", ".66", "1"))+
  theme(legend.text = element_text(hjust = 0))

combo <- ggpubr::ggarrange(g1,g2,g3, nrow = 1, legend = "bottom")
ggsave(combo, filename = "raw_data/map.png", dpi = 500, width = 10, height = 4.5)

12. Map 2

allsites <- read_csv("raw_data/boston_social_infra_2022_03_03.csv") %>%
  st_as_sf(coords = c("x", "y"), crs = 4326)

# And let's reimport the grid
grid <- read_sf("raw_data/grid_covariates_tracts.kml") %>%
  select(-c(Name:icon)) %>%
  st_join(csub %>% select(geometry), left = FALSE)


b1 <- ggplot() +
  geom_sf(data = bg, color = "lightgrey", fill = "black", size = 2)  +
  geom_sf(data = zipdata, mapping = aes(fill = pop_density),
          color = "white", size = 0.1) +
  geom_sf(data = grid, fill = NA, color = "darkgrey", size = 0.7) +
  geom_sf(data = grid, fill = NA, color = "white", size = 0.5) +
  geom_sf(data = csub, color = "white", size = 1.5, fill = NA) +
  geom_sf(data = csub, color = "black", size = 1, fill = NA) +
  geom_sf(data = allsites, shape = 21, color = "white", fill = "#785EF0", size = 1.5) +
  # Crop the map
  coord_sf(xlim = c(-71.2, -70.95),
           ylim = c(42.23, 42.40)) +
  labs(fill = "Population Density<br>per 1000 residents<br>per 1 km<sup>2</sup>") +
  theme_void(base_size = 14) +
  theme(plot.caption = ggtext::element_markdown(size = 10, hjust = 0),
        legend.title = ggtext::element_markdown(size = 12, hjust = 0)) +
  scale_fill_gradient(low = "black", high = "white", na.value = "black", 
                      trans = "log", guide = guide_colorbar(barwidth = 10, frame.colour = "#373737"),
                      breaks = c(1000, 2000, 3000, 6000, 10000, 16000, 30000),
                      labels = c("1", "2", "3", "6", "10", "16", "30")) +
  theme(legend.text = element_text(hjust = 0),
        legend.position = "bottom") 

# And let's view some distributions too, to demonstrate that they are right skewed.

b2 <- read_rds("raw_data/dataset.rds")  %>%
  pivot_longer(cols = c(total, community_spaces, social, places_of_worship, parks),
               names_to = "variable", values_to = "statistic") %>%
   mutate(variable = variable %>% factor() %>% recode_factor(
    "total" = "Total",
    "community_spaces" = "Community Spaces",
    "places_of_worship" = "Places of Worship",
    "social" = "Social Busineses",
    "parks" = "Parks")) %>%
  ggplot(mapping = aes(x = statistic, fill = variable)) +
  geom_density(mapping = aes(y = ..scaled..*100), fill = "#785EF0") +
  geom_text(mapping = aes(x = 2, y = 50, label = variable), hjust = 0, vjust = 0) +
  facet_grid(rows = vars(variable)) +
  guides(fill = "none") +
  viridis::scale_fill_viridis(discrete = TRUE, option = "plasma", begin = 0, end = 0.8) +
  scale_y_continuous(breaks = c(0, 50, 100), labels = c("0", "50", "100")) +
  scale_x_continuous(expand = expansion(add = c(0,0))) +
  theme_minimal(base_size = 14) +
  theme(panel.grid = element_blank(), 
        strip.background = element_blank(), strip.text = element_blank(),
        axis.line.y.left = element_line(color = "black"),
        panel.spacing.y = unit(0.5, "cm"),
        plot.margin = margin(0,0,0,0,"cm"),
        plot.subtitle = element_text(hjust = 0.5)) +
  labs(y = NULL,  x = "Average Zipcode Rate of Social Infrastructure\nper 1000 residents per 1 square kilometer",
       subtitle = "% of Zipcodes by Social Infrastructure")

combo <- ggpubr::ggarrange(b1,b2, nrow = 1, labels = c("A", "B"))
ggsave(combo, filename = "raw_data/map_distributions.png", dpi = 500, width = 9, height = 5)

13. Summary Statistics

z1 <- zipdata %>%
  na.omit() %>%
  as_tibble() %>%
  select(-geometry) %>%
  mutate(pop_density = pop_density / 1000,
         median_income = median_income / 1000) %>%
  pivot_longer(cols = c(pop_density:between), names_to = "variable", values_to = "value") %>%
  group_by(variable) %>%
  summarize(mean = mean(value, na.rm = TRUE) %>% round(3),
            sd = sd(value, na.rm = TRUE) %>% round(3),
            min = min(value, na.rm = TRUE) %>% round(3),
            median = median(value, na.rm = TRUE) %>% round(3),
            max = max(value, na.rm = TRUE) %>% round(3),
            nobs = sum(!is.na(value))) %>%
  ungroup() %>%
  pivot_longer(cols = c(mean, sd, min, median, max), names_to = "type", values_to = "stat") %>%
  mutate(group = case_when(variable %in% c("between", "within",
                                           "total", "community_spaces", "places_of_worship",
                                           "parks", "social") ~ "Main\nVariables",
                           TRUE ~ "Demographic\nVariables"),
         group = factor(group, levels = c("Main\nVariables", "Demographic\nVariables"))) %>%
  mutate(variable = variable %>% dplyr::recode_factor(
    "between" = "Ties Between Different Zipcodes",
    "within" = "Ties Within Same Zipcode",
    "total" = "Total Social Infrastructure Rate",
    "community_spaces" = "Community Spaces Rate",
    "places_of_worship" = "Places of Worship Rate",
    "social" = "Social Businesses Rate",
    "parks" = "Parks Rate",
    "pop_density" = "Population Density",
    "pop_women" = "% Women",
    "pop_age_65_plus" = "% Over Age 65",
    "pop_black" = "% Black",
    "pop_white" = "% White",
    "pop_hisplat" = "% Hispanic/Latino",
    "pop_asian" = "% Asian",
    "pop_natam" = "% Native American",
    "pop_pacific" = "% Pacific Islander",
    "pop_some_college" = "% Some College",
    "pop_unemployed" = "% Unemployed",
    "median_income" = "Median Income (1000s)",
    "median_monthly_housing_cost" =  "Median Monthly\nHousing Cost", 
    "income_inequality" = "Gini Coefficient") %>%
      factor(levels = levels(.) %>% rev())) %>%
  mutate(type = type %>% recode_factor(
    "mean" = "Mean",
    "sd" = "Std. Dev.",
    "min" = "Min",
    "median" = "Median",
    "max" = "Max")) %>%
  ggplot(mapping = aes(y = variable, x = type, label = stat)) +
  geom_tile(fill = NA, color = "grey") +
  geom_text() +
  facet_grid(rows = vars(group), scales = "free_y", shrink = TRUE, space = "free") +
  theme_classic(base_size = 14) +
  theme(strip.background = element_blank(),
        strip.text.y = element_text(hjust = 0, angle = 0)) +
  scale_x_discrete(position = "top", expand = expansion(add = c(0,0))) +
  labs(x = "Descriptive Statistics for Zipcodes (n = 50)",
       y = NULL) +
  theme(panel.border = element_rect(fill = NA, color = "black"),
        axis.ticks = element_blank())

ggsave(z1, filename = "raw_data/descriptives.png", dpi = 500, width = 7, height = 7.5)

Social Connectedness in Boston

Tim Fraser & Khushi Patel

2022-03-29