Hi! This site will be our starting point for sharing coding, data, and new techniques for our research project. You can start your own RStudio.Cloud project with this data using the following link.
Note: It’s okay to skip the Data Prep section below;
this is just info on how I’ve compiled our data so far. The good stuff
starts in Section 2.
First, we’re going to run the following code to generate our data.
# Load packages
library(tidyverse)
library(sf)
# Load in the raw points
read_csv("raw_data/boston_social_infra_2022_03_03.csv") %>%
# Convert to sf format,
st_as_sf(
# using the x and y columns to make a sf point object
coords = c("x", "y"),
# setting the Coordinate Reference System (CRS) to 4326,
# which is the World Global System 1984 projection.
crs = 4326) %>%
# Filter to just points that were harvested from Google
#filter(source == "googlapi") %>%
# Filter to just these common types of social infrastructure
filter(type %in% c("Parks", "Social Businesses",
"Community Spaces", "Places of Worship")) %>%
# Save as an R Data file ".rds", which is easier for use. Use read_rds() to load.
write_rds("raw_data/sites.rds")Next, let’s load and format our grid cells.
read_sf("raw_data/grid_covariates_tracts.kml") %>%
# get rid of fluff
select(-c(Name:icon)) %>%
# Just look at main zone
#filter(str_detect(milestone, "M1|M2|M3|M4")) %>%
# save to file!
write_rds("raw_data/grid.rds")library(tidyverse)
library(sf)
read_sf("https://bostonopendata-boston.opendata.arcgis.com/datasets/boston::boston-buildings.geojson?outSR=%7B%22latestWkid%22%3A2249%2C%22wkid%22%3A102686%7D") %>%
select(building_id = BLDG_ID) %>%
write_rds("raw_data/buildings.rds")
# Convert buildings to points
read_rds("raw_data/buildings.rds") %>%
mutate(geometry = st_centroid(geometry)) %>%
# Do an inner join to just keep buildings within this grid
st_join(read_rds("raw_data/grid.rds") %>% select(cell_id, geometry), left = FALSE) %>%
mutate(building_id = building_id %>% na_if("")) %>%
filter(!is.na(building_id)) %>%
write_rds("raw_data/buildings_points.rds")library(tidyverse)
library(sf)
read_sf("https://bostonopendata-boston.opendata.arcgis.com/datasets/boston::boston-street-segments.geojson?outSR=%7B%22latestWkid%22%3A2249%2C%22wkid%22%3A102686%7D") %>%
select(street_id = STREET_ID, segment_id = SEGMENT_ID) %>%
# Do an inner join to just keep roads within this grid
st_join(read_rds("raw_data/grid.rds") %>% select(cell_id, geometry), left = FALSE) %>%
write_rds("raw_data/streets.rds")# Here's how you might visualize these all together
grid <- read_rds("raw_data/grid.rds")
sites <- read_rds("raw_data/sites.rds")
buildings <- read_rds("raw_data/buildings_points.rds") # >40,000 points
ggplot() +
geom_sf(data = grid) +
geom_sf(data = buildings, color = "blue", alpha = 0.25) +
geom_sf(data = sites)
rm(list = ls())Pick your case study! Which grid cell do you want? Explore on Google MyMaps, learn the number of the cell, and try out the analyses below on your chosen grid cell. If you get ideas - go wild!
library(tidyverse)
library(sf)
grid <- read_rds("raw_data/grid.rds")
grid %>% head()## Simple feature collection with 6 features and 21 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: -71.11646 ymin: 42.30699 xmax: -71.02922 ymax: 42.33873
## Geodetic CRS: WGS 84
## # A tibble: 6 × 22
## cell_id neighborhood milestone pop_density pop_women pop_white pop_black
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Cell 100 <NA> M3: Core Bost… NA 0.409 0.549 0.207
## 2 Cell 106 Mission Hill M2: Around No… 8368. 0.570 0.603 0.165
## 3 Cell 107 Roxbury M2: Around No… 9357. 0.524 0.472 0.259
## 4 Cell 108 Roxbury M2: Around No… 7430. 0.523 0.204 0.517
## 5 Cell 109 Roxbury M3: Core Bost… 6780. 0.476 0.254 0.522
## 6 Cell 110 South Boston M3: Core Bost… 6051. 0.492 0.547 0.259
## # … with 15 more variables: pop_natam <dbl>, pop_asian <dbl>,
## # pop_pacific <dbl>, pop_hisplat <dbl>, pop_some_college <dbl>,
## # employees_muni <dbl>, employees_state <dbl>, employees_fed <dbl>,
## # median_income <dbl>, income_inequality <dbl>, pop_unemployed <dbl>,
## # median_monthly_housing_cost <dbl>, pop_age_65_plus <dbl>,
## # pop_density_int <dbl>, geometry <POLYGON [°]>
ggplot() +
geom_sf(data = grid, mapping = aes(fill = neighborhood)) +
geom_sf_text(data = grid, mapping = aes(label = str_remove(cell_id, "Cell "))) +
theme_void()## Warning in st_point_on_surface.sfc(sf::st_zm(x)): st_point_on_surface may not
## give correct results for longitude/latitude data
Next, let’s try to replicate the distance analysis for every cell.
# Here's how you might visualize these all together
# Get the grid cell
grid <- read_rds("raw_data/grid.rds")
# Let's make a series of buffer zones around our grid cells.
# Let's make a 1km circle around our grid cells
buffer <- grid %>%
select(buffer_id = cell_id, geometry) %>%
st_buffer(dist = 1000)For example, we can see the shape of that buffer below!
ggplot() +
geom_sf(data = buffer) +
geom_sf(data = grid, fill = NA, color = "white")Next, let’s get a jumbo dataset, where each row indicates a site-buffer pair, showing every site that fell into every buffer. (In other words, some sites will show up multiple times, captured by multiple buffers. That’s okay - that’s intended.) Let’s also import our buildings, to make a gigantic dataset of building-buffer pairs. It’s not clear which size of buffer we need, so we’re going to do a bunch, all in a loop.
get_buffer = function(mydistance){
# Let's make an ZZZZ circle around our grid cells
buffer <- grid %>%
select(buffer_id = cell_id, geometry) %>%
# Specify the distance of the perimeter, using 'mydistance'
st_buffer(dist = mydistance)
# Let's import ALL the social infrastructure sites located within 1 square kilometers of each grid cell
read_rds("raw_data/sites.rds") %>%
# Lets zoom in JUST to sites overlapping our 1 km buffer, doing an inner_join (as opposed to a left_join)
st_join(buffer, left = FALSE) %>%
# Let's keep a copy
write_rds(paste("raw_data/buffer_sites_", mydistance, ".rds", sep = ""))
read_rds("raw_data/buildings_points.rds") %>%
# Each matched to a buffer!
# Lets zoom in JUST to sites overlapping our 1 km buffer, doing an inner_join (as opposed to a left_join)
st_join(buffer, left = FALSE) %>%
# let's keep a copy just in case
write_rds(paste("raw_data/buffer_buildings_", mydistance, ".rds", sep = ""))
# Now save the buffer to file, in case we need it.
buffer %>%
write_rds(paste("raw_data/buffer_", mydistance, ".rds", sep = ""))
}
# Run the loop for 0, 100, 500, 1000, 1500, and 2000 meter buffers
c(0, 100, 500, 1000, 1500, 2000) %>%
map(~get_buffer(.))Now, we’re going to do something called a function. In a
function, we write a loop of code that we ask the computer to do several
times in a row, given an input. Only the input changes, so your choice
of input matters - in this case, we will supply the function with each
of our differnt buffer_id, so that we can run this code
many times, once per buffer.
get_distance = function(mybuffer_id){
require(tidyverse)
require(sf)
# Show progress
print(mybuffer_id)
# Import our subset data
samplebuildings <- buildings %>%
# Zoom into just buildings within our cell of interest
filter(cell_id %in% mybuffer_id)
samplesites <- sites %>%
# Zoom into just sites within our the buffer of our cell of interest
filter(buffer_id %in% mybuffer_id) %>%
select(id, type)
# If we have any valid data in this cell to analyze, then do the following:
if(length(samplebuildings$building_id) > 0 & length(samplesites$id) > 0){
# Collect a set of unique identifier pairs for the lines we're about the make
# Use expand grid to get a complete set of buildings and sites,
# In the same order supplied to st_nearest_points
mydetails <- expand_grid(
# Grab the building ID
samplebuildings %>% as_tibble() %>% select(building_id),
# Grab the site id and type
samplesites %>% as_tibble() %>% select(id, type))
mylines <- samplebuildings %>%
# Please give me lines between my building and every one of our sites
st_nearest_points(samplesites) %>%
# Format as an sf object, with the WGS 84 coordinate reference system (code = 4326)
# and rename the weird column to geometry
st_as_sf(crs = 4326) %>% rename(geometry = x) %>%
# Now calculate the distance of these lines, in meters!
mutate(dist = st_length(geometry) %>% as.numeric()) %>%
as_tibble() %>%
select(dist) %>%
# Bind in unique identifiers
bind_cols(mydetails %>% select(building_id, type))
mylines %>%
# For each type, per grid cell buffer,
group_by(building_id, type) %>%
# Calculate median distance using 10 thresholds,
summarize(
# let's get median distance as crow flies between buildings in this block
# and social infrastructure of this type
# within 100 feet of that building
dist100 = median(dist[dist <= 100], na.rm = TRUE),
# within 200 feet of that building
dist200 = median(dist[dist <= 200], na.rm = TRUE),
# within 300 feet of that building
dist300 = median(dist[dist <= 300], na.rm = TRUE),
# within 400 feet of that building
dist400 = median(dist[dist <= 400], na.rm = TRUE),
# et cetera
dist500 = median(dist[dist <= 500], na.rm = TRUE),
dist600 = median(dist[dist <= 600], na.rm = TRUE),
dist700 = median(dist[dist <= 700], na.rm = TRUE),
dist800 = median(dist[dist <= 800], na.rm = TRUE),
dist900 = median(dist[dist <= 900], na.rm = TRUE),
dist1000 = median(dist[dist <= 1000], na.rm = TRUE),
# Also be sure to use no threshold once too
dist = median(dist, na.rm = TRUE),
# Let's also count the total sites that fall into that area!
count100 = sum(!is.na(type[dist <= 100]), na.rm = TRUE),
# within 200 feet of that building
count200 = sum(!is.na(type[dist <= 200]), na.rm = TRUE),
# within 300 feet of that building
count300 = sum(!is.na(type[dist <= 300]), na.rm = TRUE),
# within 400 feet of that building
count400 = sum(!is.na(type[dist <= 400]), na.rm = TRUE),
# et cetera
count500 = sum(!is.na(type[dist <= 500]), na.rm = TRUE),
count600 = sum(!is.na(type[dist <= 600]), na.rm = TRUE),
count700 = sum(!is.na(type[dist <= 700]), na.rm = TRUE),
count800 = sum(!is.na(type[dist <= 800]), na.rm = TRUE),
count900 = sum(!is.na(type[dist <= 900]), na.rm = TRUE),
count1000 = sum(!is.na(type[dist <= 1000]), na.rm = TRUE),
# Also be sure to use no threshold once too
count = sum(!is.na(type), na.rm = TRUE),
# Finally, let's append that buffer ID
buffer_id = mybuffer_id) %>%
ungroup() %>%
# Write that cell to file
write_rds(paste("count/", mybuffer_id, ".rds", sep = ""))
}
}
get_distance %>%
write_rds("raw_data/get_distance_function.rds")
remove(grid, buffer) Finally, let’s run the loop!
# First, let's load our data in.
buildings <- read_rds("raw_data/buildings.rds") %>%
st_join(read_rds("raw_data/grid.rds") %>% select(cell_id))
buffer <- read_rds("raw_data/buffer_1000.rds")
sites <- read_rds("raw_data/buffer_sites_1000.rds") %>% select(id, type, buffer_id)
get_distance <- read_rds("raw_data/get_distance_function.rds")
# Make a folder to hold our results
dir.create("count")
# Note: I wouldn't recommend running this code.
# It took me about 25 minute with 8GB of RAM.
# A usual RStudio Cloud Project has 1 GB of RAM.
library(tidyverse)
library(sf)
library(future)
library(furrr)
plan("multisession")
buffer$buffer_id %>%
#done <- str_remove(dir("count"), ".rds")
#buffer$buffer_id[!buffer$buffer_id %in% done] %>%
furrr::future_map(~get_distance(.), .progress = TRUE)
plan("sequential")
# Now bind the results together into one data.frame
dir("count", full.names = TRUE) %>%
map_dfr(~read_rds(.)) %>%
write_rds("raw_data/buffer_dist.rds")rm(list = ls())Let’s also grab, as covariates, the type of zoning for each building and the cost of that building.
library(tidyverse)
library(sf)
# https://data.boston.gov/dataset/property-assessment/resource/c4b7331e-e213-45a5-adda-052e4dd31d41?inner_span=True
read_csv("https://data.boston.gov/dataset/e02c44d2-3c64-459c-8fe2-e1ce5f38a035/resource/c4b7331e-e213-45a5-adda-052e4dd31d41/download/data2021-full.csv") %>%
magrittr::set_colnames(value = names(.) %>% tolower() %>% str_replace_all(" ", "_")) %>%
select(pid, lu, own_occ,
total_value,land_value, bldg_value,
gross_area, land_area = land_sf, bldg_area = living_area,
gross_tax, yr_built,yr_remodel, overall_cond) %>%
# Convert land value to numeric, and calculate total value per square foot (of land and property)
mutate_at(vars(total_value, land_value, bldg_value, gross_tax), list(~parse_number(.))) %>%
# Convert categories to factor!
mutate_at(vars(own_occ, lu, overall_cond), list(~factor(.))) %>%
mutate(lu = lu %>% recode_factor(
"A" = "Apartment Building (7 or more units)",
"AH" = "Agricultural/Horticultural",
"C" = "Commericial",
"CC" = "Commercial Condo",
"CD" = "Residential Condo",
"CL" = "Commercial Land",
"CM" = "Condo main structure",
"CP" = "Condo parking",
"E" = "Tax-Exempt",
"EA" = "Tax-Exempt (blighted)",
"I" = "Industrial",
"R1" = "Residential 1-family",
"R2" = "Residential 2-family",
"R3" = "Residential 3-family",
"R4" = "Residential 4+family",
"RC" = "Mixed use",
"RL" = "Residential Land")) %>%
# Recode variable to an ordinal scale from 1 (unsound) to 8 (excellent)
mutate(overall_cond = overall_cond %>% recode_factor(
"EX - Excellent" = "8",
"E - Excellent" = "8",
"VG - Very Good" = "7",
"G - Good" = "6",
"AVG - Default - Average" = "5",
"A - Average" = "5",
"F - Fair" = "4",
"P - Poor" = "3",
"VP - Very Poor" = "2",
"US - Unsound" = "1") %>% as.character() %>% as.numeric()) %>%
# Turn owner occupied into a binary variable
mutate(own_occ = if_else(own_occ == "Y", true = 1, false = 0, missing = NA_real_)) %>%
# Calculate cost per square foot
mutate(cost_sqft = total_value / gross_area) %>%
# A couple records were duplicated. Take the median of each
group_by(pid) %>%
summarize(lu = unique(lu),
own_occ = median(own_occ, na.rm = TRUE),
cost_sqft = median(cost_sqft, na.rm = TRUE),
overall_cond = median(overall_cond, na.rm = TRUE),
yr_built = median(yr_built, na.rm = TRUE),
gross_tax = median(gross_tax, na.rm = TRUE)) %>%
ungroup() %>%
write_rds("raw_data/buildings_value.rds")We’re also going to estimate distance of all buildings from the nearest bus and train stop.
library(tidyverse)
library(sf)
train <- read_sf("raw_data/transit/MBTA_NODE.shp") %>%
st_transform(crs = 4326) %>% select(station = STATION, line = LINE) %>%
# make an extra geometry column
mutate(geo_transit = geometry)
bus <- read_sf("raw_data/transit/MBTABUSSTOPS_PT.shp") %>%
st_transform(crs = 4326) %>%
select(stop_id = STOP_ID) %>%
# make an extra geometry column
mutate(geo_transit = geometry)
buildings <- read_rds("raw_data/buildings_points.rds") %>%
rename(geo_building = geometry)
# Identify which train stop is closest to each building site
buildings %>%
st_join(train, join = st_nearest_feature) %>%
# Next, we're going to bind in the coordinates of
bind_cols(st_coordinates(.$geo_building) %>% as_tibble() %>% select(x1 = 1, y1 = 2),
st_coordinates(.$geo_transit) %>% as_tibble() %>% select(x2 = 1, y2 = 2)) %>%
# Filter out any locations that aren't quite right
#filter(!is.na(x1) & !is.na(y1) & !is.na(x2) & !is.na(y2)) %>%
select(building_id, line, x1:y2) %>%
# Make a linestring
mutate(geometry = sprintf("LINESTRING(%s %s, %s %s)", x1, y1, x2, y2)) %>%
as_tibble() %>%
select(building_id, line, geometry) %>%
st_as_sf(wkt = "geometry", crs = 4326) %>%
# And, let's calculate how far away it is!
mutate(dist = st_length(geometry) %>% as.numeric()) %>%
as_tibble() %>%
select(building_id, train_dist = dist, train_nearest = line) %>%
write_rds("raw_data/buildings_train.rds")
# Identify which bus stops is closest to each building site
buildings %>%
st_join(bus, join = st_nearest_feature) %>%
# Next, we're going to bind in the coordinates of
bind_cols(st_coordinates(.$geo_building) %>% as_tibble() %>% select(x1 = 1, y1 = 2),
st_coordinates(.$geo_transit) %>% as_tibble() %>% select(x2 = 1, y2 = 2)) %>%
# Filter out any locations that aren't quite right
#filter(!is.na(x1) & !is.na(y1) & !is.na(x2) & !is.na(y2)) %>%
select(building_id, stop_id, x1:y2) %>%
# Make a linestring
mutate(geometry = sprintf("LINESTRING(%s %s, %s %s)", x1, y1, x2, y2)) %>%
as_tibble() %>%
select(building_id, stop_id, geometry) %>%
st_as_sf(wkt = "geometry", crs = 4326) %>%
# And, let's calculate how far away it is!
mutate(dist = st_length(geometry) %>% as.numeric()) %>%
as_tibble() %>%
select(building_id, bus_dist = dist, bus_nearest = stop_id) %>%
write_rds("raw_data/buildings_bus.rds")
train <- read_rds("raw_data/buildings_train.rds")
bus <- read_rds("raw_data/buildings_bus.rds")
# The codes are identical, so we can bind_cols()
# sum(train$building_id == bus$building_id)
bind_cols(train, bus %>% select(bus_dist, bus_nearest)) %>%
# Where we can access distance to closest bus or train
write_rds("raw_data/buildings_transit.rds")
rm(list = ls())library(tidyverse)
grid <- read_rds("raw_data/grid.rds") %>%
as_tibble() %>%
select(-geometry) %>%
mutate(neighborhood = na_if(neighborhood, "")) %>%
mutate(neighborhood = if_else(is.na(neighborhood), "Other", neighborhood))
dat <- read_rds("raw_data/buildings_value.rds")
transit <- read_rds("raw_data/buildings_transit.rds") %>%
# There were a couple of duplicates;
# let's take the smallest distance to compensate
group_by(building_id) %>%
summarize(train_dist = min(train_dist, na.rm = TRUE),
bus_dist = min(bus_dist, na.rm = TRUE)) %>%
ungroup()
mydist <- read_rds("raw_data/buffer_dist.rds") %>%
# There are 275 builings in Boston that are unlabelled. We will just remove them.
filter(nchar(building_id) > 1) %>%
# Collect the following variables, telling us...
# This building in THAT CELL was a median of this far away from social infrastructure sites of TYPE X within THAT RADIUS?
select(cell_id = buffer_id, building_id, type, dist = dist1000) %>%
mutate(type = type %>% recode_factor(
"Community Spaces" = "community",
"Places of Worship" = "worship",
"Social Businesses" = "social",
"Parks" = "parks")) %>%
pivot_wider(id_cols = c(building_id, cell_id), names_from = type, values_from = dist) %>%
# Some buildings got caught in multiple grid cells, presumably because they site at the cross between several lines
# # eg. building_id == "Bos_0104067000_B0" is in Cell 184, CEll 185, Cell 195, and Cell 196's buffer.
# However, THANK GOODNESS, their distance measurements are all the same.
# so, we're just going to consolidate them into unique records,
group_by(building_id) %>%
summarize(
# Get the modal cell,
cell_id = cell_id %>% table() %>% sort(decreasing = TRUE) %>% names() %>% .[1],
# and taking the median of their entries
community = median(community, na.rm = TRUE),
parks = median(parks, na.rm = TRUE),
worship = median(worship, na.rm = TRUE),
social = median(social, na.rm = TRUE)) %>%
ungroup() %>%
left_join(by = "cell_id", y = grid %>%
select(cell_id, neighborhood, pop_density_int, pop_white, pop_black, pop_hisplat, pop_asian,
pop_some_college, median_income, income_inequality, median_monthly_housing_cost)) %>%
# Get the unique building code.
# Some buildings have multiple entries, as a B0 & B1 (It only happens rarely, so this describes places that have extensions large enough to render themselves their own polygons.)
mutate(id = str_extract(building_id, pattern = "[0-9]+")) %>%
# Now join in Property Records
left_join(by = c("id" = "pid"), y = dat) %>%
# 89999
# Now join in distance to public transit
left_join(by = "building_id", y = transit) %>%
# Save to file
write_rds("building_dist_dataset.rds")
rm(list=ls())library(GGally)## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
mydat <- read_rds("building_dist_dataset.rds") %>%
# Filter to sites within boston neighborhoods
filter(neighborhood != "Other")
viz <- mydat %>%
mutate_at(vars(social, community, parks, worship), list(~.^2)) %>%
select(community, worship, social,parks,
pop_density_int, pop_black, pop_hisplat, pop_asian,
pop_some_college, median_income, income_inequality, median_monthly_housing_cost, #lu,
train_dist, bus_dist, overall_cond, own_occ, yr_built, gross_tax, cost_sqft) %>%
GGally::ggcorr(hjust = 1, label = TRUE, label_color = "#373737",
digits = 1, hjust = 1, layout.exp = 5) %>%
with(data) %>%
mutate_at(vars(x, y), list(~factor(.) %>% recode_factor(
"community" = "Community Spaces",
"worship" = "Places of Worship",
"social" = "Social Businesses",
"parks" = "Parks",
"pop_density_int" = "Pop. Density",
"pop_black" = "% Black",
"pop_hisplat" = "% Hispanic/Latino",
"pop_asian" = "% Asian",
"pop_some_college" = "% Some College",
"median_income" = "Median Income",
"income_inequality" = "Income Inequality",
"median_monthly_housing_cost" = "Median Monthly Housing Cost",
"train_dist" = "Distance to Train",
"bus_dist" = "Distance to Bus",
"overall_cond" = "Building Condition",
"own_occ" = "Owner Occupied",
"yr_built" = "Year Built",
"gross_tax" = "Gross Tax",
"cost_sqft" = "Cost per sq. ft."))) %>%
mutate(xnum = as.numeric(x),
ynum = as.numeric(y)) ## Warning: Duplicated aesthetics after name standardisation: hjust
viz %>%
ggplot(mapping = aes(x = reorder(x, xnum), y = y, fill = coefficient, label = label)) +
geom_tile() +
geom_text() +
geom_tile(data = . %>% filter(abs(coefficient) >= 0.75),
fill = NA, color = "black", size = 1) +
scale_fill_gradient2(low = "#DC267F", mid = "white",
high = "#648FFF", midpoint = 0,
limits = c(-1, 1),
breaks = c(-1, -0.75, -0.5, -0.25, 0, .25, 0.5, 0.75, 1)) +
labs(caption = "Note: Black squares denote correlation over +/-0.75.\nParks, Social Businesses, Places of Worship, and Community Spaces refer to median distance to locations.",
fill = "Pearson's r\nCorrelation Coefficient") +
theme_classic(base_size = 14) +
scale_y_discrete(position = "right", expand = expansion(0)) +
scale_x_discrete(position = "bottom", expand = expansion(0)) +
labs(x = NULL, y = NULL) +
theme(legend.position = "bottom",
axis.line = element_blank(),
axis.ticks = element_blank(),
plot.caption = element_text(size = 12, hjust = 0),
legend.title = element_text(size = 12, hjust = 0),
axis.text.y.right = element_text(hjust = 0),
axis.text.x.bottom = element_text(hjust = 0, angle = -30)) +
guides(fill = guide_colorsteps(barwidth = 25, barheight = 2))Looks like median income and median monthly housing cost can’t be in the same model here. Other than that, we’re doing pretty good!
mydat <- read_rds("building_dist_dataset.rds") %>%
# Filter to sites within boston neighborhoods
filter(neighborhood != "Other")
# What kinds of places end up with closer access to social infrastructure?
m0 <- mydat %>%
lm(formula = I(community^2) ~ pop_density_int +
# Race, education, and income are deeply correlated in Boston.
# To keep collinearity below the VIF < 10 threshold,
# we use just one variable for race - share of white residents
# and split it into evenly sized quartiles as a numeric variable
# We also log-transformed median income, since it was right skewed
ntile(pop_white, 4) + log(median_income) + #income_inequality +
# We also log transformed distance variables, since they were right skewed
log(train_dist) + log(bus_dist) +
# We also log transformed the building value, for right skew
log(cost_sqft) + overall_cond + own_occ + yr_built + lu +
# It's not possible to control for grid cells, due to high collinearity,
# But we added fixed effects for neighborhood to adjust
# for some natural geographic correlation
neighborhood)
m0 %>% car::vif() %>% .^2## GVIF Df GVIF^(1/(2*Df))
## pop_density_int 30.979601 1 5.565932
## ntile(pop_white, 4) 80.012981 1 8.944998
## log(median_income) 50.251770 1 7.088848
## log(train_dist) 8.969525 1 2.994917
## log(bus_dist) 1.353869 1 1.163559
## log(cost_sqft) 2.024461 1 1.422836
## overall_cond 1.459624 1 1.208149
## own_occ 2.449333 1 1.565035
## yr_built 1.661020 1 1.288806
## lu 8.588322 100 1.113513
## neighborhood 7624.880729 256 1.322269
# Here's what the quartiles look like
mydat %>%
group_by(breaks = ntile(pop_white, 4)) %>%
summarize(lower = min(pop_white, na.rm = TRUE),
upper = max(pop_white, na.rm = TRUE))## # A tibble: 4 × 3
## breaks lower upper
## <int> <dbl> <dbl>
## 1 1 0.0610 0.355
## 2 2 0.355 0.585
## 3 3 0.585 0.724
## 4 4 0.724 0.933
rm(list = ls())mydat <- read_rds("building_dist_dataset.rds") %>%
filter(neighborhood != "Other")
# Distance to Community Spaces
m1 <- mydat %>%
lm(formula = I((community/1000)^2) ~ pop_density_int +
ntile(pop_white, 4) + log(median_income) + neighborhood)
#-12.33
m2 <- mydat %>%
lm(formula = I((community/1000)^2) ~ pop_density_int +
ntile(pop_white, 4) + log(median_income) + neighborhood +
log(train_dist) + log(bus_dist) + log(cost_sqft))
m3 <- mydat %>%
lm(formula = I((community/1000)^2) ~ pop_density_int +
ntile(pop_white, 4) + log(median_income) + neighborhood +
log(train_dist) + log(bus_dist) + log(cost_sqft) +
lu + overall_cond + own_occ + yr_built)
# Distance to Places of Worship
m4 <- mydat %>%
lm(formula = I((worship/1000)^2) ~ pop_density_int +
ntile(pop_white, 4) + log(median_income) + neighborhood)
m5 <- mydat %>%
lm(formula = I((worship/1000)^2) ~ pop_density_int +
ntile(pop_white, 4) + log(median_income) + neighborhood +
log(train_dist) + log(bus_dist) + log(cost_sqft))
m6 <- mydat %>%
lm(formula = I((worship/1000)^2) ~ pop_density_int +
ntile(pop_white, 4) + log(median_income) + neighborhood +
log(train_dist) + log(bus_dist) + log(cost_sqft) +
lu + overall_cond + own_occ + yr_built)
# Distance to Social Businesses
m7 <- mydat %>%
lm(formula = I((social/1000)^2) ~ pop_density_int +
ntile(pop_white, 4) + log(median_income) + neighborhood)
m8 <- mydat %>%
lm(formula = I((social/1000)^2) ~ pop_density_int +
ntile(pop_white, 4) + log(median_income) + neighborhood +
log(train_dist) + log(bus_dist) + log(cost_sqft))
m9 <- mydat %>%
lm(formula = I((social/1000)^2) ~ pop_density_int +
ntile(pop_white, 4) + log(median_income) + neighborhood +
log(train_dist) + log(bus_dist) + log(cost_sqft) +
lu + overall_cond + own_occ + yr_built)
# Distance to Parks
m10 <- mydat %>%
lm(formula = I((parks/1000)^2) ~ pop_density_int +
ntile(pop_white, 4) + log(median_income) + neighborhood)
m11 <- mydat %>%
lm(formula = I((parks/1000)^2) ~ pop_density_int +
ntile(pop_white, 4) + log(median_income) + neighborhood +
log(train_dist) + log(bus_dist))
m12 <- mydat %>%
lm(formula = I((parks/1000)^2) ~ pop_density_int +
ntile(pop_white, 4) + log(median_income) + neighborhood +
log(train_dist) + log(bus_dist) + log(cost_sqft) +
lu + overall_cond + own_occ + yr_built)
get_vif = function(mymodel){
mymodel %>% car::vif() %>% .^2 %>% .[,3] %>% max()
}
# Calculate VIF for every model, in a list
list(m1,m2,m3,m4,m5,m6,m7,m8,m9, m10,m11,m12) %>%
map(~get_vif(.)) %>%
write_rds("raw_data/myvif.rds")
# Extract model details, in a list
list(m1,m2,m3,m4,m5,m6,m7,m8,m9, m10,m11,m12) %>%
map(~texreg::extract(.)) %>%
write_rds("raw_data/mymodels.rds")
rm(list = ls())mymodels <- read_rds("raw_data/mymodels.rds")
myvif <- read_rds("raw_data/myvif.rds")
texreg::htmlreg(
mymodels,
custom.header = list("Median Distance (Squared) to<br>Community Spaces" = 1:3,
"Median Distance (Squared) to<br>Places of Worship" = 4:6,
"Median Distance (Squared) to<br>Social Businesses" = 7:9,
"Median Distance (Squared) to<br>Parks" = 10:12),
custom.model.names = c(
"Model 1<br>Area Traits",
"Model 2<br>Amenities",
"Model 3<br>Extended Controls",
"Model 4<br>Area Traits",
"Model 5<br>Amenities",
"Model 6<br>Extended Controls",
"Model 7<br>Area Traits",
"Model 8<br>Amenities",
"Model 9<br>Extended Controls",
"Model 10<br>Area Traits",
"Model 11<br>Amenities",
"Model 12<br>Extended Controls"),
bold = 0.10, stars = c(0.001, 0.01, 0.05, 0.1),
custom.gof.rows = list("Max VIF" = myvif %>% unlist()),
omit.coef = "neighborhood",
caption.above = TRUE,
caption = "<b>Ordinary Least Squares Models of Boston Buildings (n = 85,592)</b>
<br>
<b>Dependent Variable<b>: Median Distance Squared from Buildings to Nearby Social Infrastructure, measured in kilometers.",
custom.note = "<b>Statistical Significance</b>: *** p < 0.001, ** p < 0.01, * p < 0.05, . p < 0.10.
<br>
<b>Fixed Effects:</b> All models include 17 fixed effects for neighborhood, including East Boston, Charleston, Beacon Hill, Downtown, South Boston, South End, Fenway/Kenmore, Back Bay, Roxbury, Mission Hill, Dorchester, Jamaica Plain, Mattapan, Hyde Park, Roslindale, West Roxbury, and Allston/Brighton."
) %>%
htmltools::HTML()| Â | Median Distance (Squared) to Community Spaces |
Median Distance (Squared) to Places of Worship |
Median Distance (Squared) to Social Businesses |
Median Distance (Squared) to Parks |
||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Â | Model 1 Area Traits |
Model 2 Amenities |
Model 3 Extended Controls |
Model 4 Area Traits |
Model 5 Amenities |
Model 6 Extended Controls |
Model 7 Area Traits |
Model 8 Amenities |
Model 9 Extended Controls |
Model 10 Area Traits |
Model 11 Amenities |
Model 12 Extended Controls |
| (Intercept) | 0.38*** | 0.49*** | -0.20* | -0.38*** | -0.36*** | -1.42*** | 1.18*** | 0.95*** | -0.62*** | 0.79*** | 0.71*** | 0.26*** |
| Â | (0.06) | (0.07) | (0.10) | (0.05) | (0.05) | (0.07) | (0.06) | (0.06) | (0.08) | (0.04) | (0.04) | (0.07) |
| pop_density_int | -0.00*** | -0.00*** | -0.00*** | -0.00*** | -0.00*** | -0.00*** | -0.00*** | -0.00*** | -0.00*** | -0.00** | -0.00* | -0.00* |
| Â | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) |
| ntile(pop_white, 4) | -0.04*** | -0.04*** | -0.04*** | -0.01*** | -0.01*** | -0.01*** | 0.00 | 0.00 | 0.00 | 0.01*** | 0.01*** | 0.01*** |
| Â | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) | (0.00) |
| log(median_income) | 0.02*** | 0.01 | 0.00 | 0.08*** | 0.08*** | 0.08*** | -0.06*** | -0.06*** | -0.06*** | -0.03*** | -0.03*** | -0.03*** |
| Â | (0.01) | (0.01) | (0.01) | (0.00) | (0.00) | (0.01) | (0.01) | (0.01) | (0.01) | (0.00) | (0.00) | (0.00) |
| log(train_dist) |  | -0.00 | -0.00· |  | -0.01*** | -0.01*** |  | 0.02*** | 0.02*** |  | 0.01*** | 0.01*** |
| Â | Â | (0.00) | (0.00) | Â | (0.00) | (0.00) | Â | (0.00) | (0.00) | Â | (0.00) | (0.00) |
| log(bus_dist) | Â | 0.01*** | 0.01*** | Â | 0.01*** | 0.00*** | Â | 0.01*** | 0.00** | Â | 0.01*** | 0.00*** |
| Â | Â | (0.00) | (0.00) | Â | (0.00) | (0.00) | Â | (0.00) | (0.00) | Â | (0.00) | (0.00) |
| log(cost_sqft) | Â | 0.00*** | 0.00 | Â | 0.01*** | 0.00 | Â | 0.01*** | 0.01*** | Â | Â | 0.00 |
| Â | Â | (0.00) | (0.00) | Â | (0.00) | (0.00) | Â | (0.00) | (0.00) | Â | Â | (0.00) |
| luCommericial | Â | Â | -0.00 | Â | Â | -0.01 | Â | Â | -0.02*** | Â | Â | -0.01 |
| Â | Â | Â | (0.01) | Â | Â | (0.00) | Â | Â | (0.01) | Â | Â | (0.00) |
| luCommercial Land | Â | Â | -0.05 | Â | Â | -0.01 | Â | Â | 0.04 | Â | Â | 0.00 |
| Â | Â | Â | (0.06) | Â | Â | (0.04) | Â | Â | (0.05) | Â | Â | (0.04) |
| luTax-Exempt | Â | Â | -0.01* | Â | Â | 0.02** | Â | Â | 0.03*** | Â | Â | 0.01** |
| Â | Â | Â | (0.01) | Â | Â | (0.00) | Â | Â | (0.01) | Â | Â | (0.00) |
| luTax-Exempt (blighted) | Â | Â | -0.08*** | Â | Â | -0.02* | Â | Â | -0.00 | Â | Â | 0.03** |
| Â | Â | Â | (0.01) | Â | Â | (0.01) | Â | Â | (0.01) | Â | Â | (0.01) |
| luIndustrial |  |  | 0.03· |  |  | 0.03* |  |  | -0.03* |  |  | -0.01 |
| Â | Â | Â | (0.01) | Â | Â | (0.01) | Â | Â | (0.01) | Â | Â | (0.01) |
| luResidential 1-family |  |  | 0.03*** |  |  | 0.04*** |  |  | 0.04*** |  |  | 0.01· |
| Â | Â | Â | (0.01) | Â | Â | (0.00) | Â | Â | (0.00) | Â | Â | (0.00) |
| luResidential 2-family | Â | Â | 0.02*** | Â | Â | 0.03*** | Â | Â | 0.03*** | Â | Â | -0.01** |
| Â | Â | Â | (0.01) | Â | Â | (0.00) | Â | Â | (0.00) | Â | Â | (0.00) |
| luResidential 3-family |  |  | 0.01** |  |  | 0.02*** |  |  | 0.00 |  |  | -0.01· |
| Â | Â | Â | (0.01) | Â | Â | (0.00) | Â | Â | (0.00) | Â | Â | (0.00) |
| luResidential 4+family | Â | Â | 0.00 | Â | Â | 0.01** | Â | Â | -0.00 | Â | Â | -0.00 |
| Â | Â | Â | (0.01) | Â | Â | (0.01) | Â | Â | (0.01) | Â | Â | (0.00) |
| luMixed use | Â | Â | 0.01 | Â | Â | 0.01* | Â | Â | -0.01 | Â | Â | 0.00 |
| Â | Â | Â | (0.01) | Â | Â | (0.01) | Â | Â | (0.01) | Â | Â | (0.00) |
| overall_cond | Â | Â | 0.00 | Â | Â | -0.00** | Â | Â | -0.01*** | Â | Â | 0.00 |
| Â | Â | Â | (0.00) | Â | Â | (0.00) | Â | Â | (0.00) | Â | Â | (0.00) |
| own_occ |  |  | 0.00· |  |  | 0.00 |  |  | 0.00** |  |  | -0.00 |
| Â | Â | Â | (0.00) | Â | Â | (0.00) | Â | Â | (0.00) | Â | Â | (0.00) |
| yr_built | Â | Â | 0.00*** | Â | Â | 0.00*** | Â | Â | 0.00*** | Â | Â | 0.00*** |
| Â | Â | Â | (0.00) | Â | Â | (0.00) | Â | Â | (0.00) | Â | Â | (0.00) |
| Max VIF | 8.94 | 8.83 | 8.94 | 8.95 | 8.85 | 8.96 | 8.79 | 8.66 | 8.78 | 8.74 | 8.74 | 8.74 |
| R2 | 0.05 | 0.05 | 0.05 | 0.14 | 0.13 | 0.14 | 0.09 | 0.09 | 0.11 | 0.05 | 0.05 | 0.05 |
| Adj. R2 | 0.05 | 0.05 | 0.05 | 0.14 | 0.13 | 0.14 | 0.09 | 0.09 | 0.11 | 0.05 | 0.05 | 0.05 |
| Num. obs. | 79194 | 68836 | 68398 | 85323 | 74647 | 74129 | 84829 | 74198 | 73690 | 84674 | 84663 | 73491 |
| Statistical Significance: *** p < 0.001, ** p < 0.01, * p < 0.05, . p < 0.10.
Fixed Effects: All models include 17 fixed effects for neighborhood, including East Boston, Charleston, Beacon Hill, Downtown, South Boston, South End, Fenway/Kenmore, Back Bay, Roxbury, Mission Hill, Dorchester, Jamaica Plain, Mattapan, Hyde Park, Roslindale, West Roxbury, and Allston/Brighton. |
||||||||||||
library(tidyverse)
library(Zelig)
library(viridis)
mydat <- read_rds("building_dist_dataset.rds") %>%
filter(neighborhood != "Other") %>%
mutate(pop_white = ntile(pop_white, 4)) %>%
mutate_at(vars(neighborhood, lu), list(~factor(.)))
# Let's run each model in Zelig format
z1 <- mydat %>%
zelig(formula = I((community/1000)^2) ~ pop_density_int +
pop_white + log(median_income) + neighborhood +
log(train_dist) + log(bus_dist) + log(cost_sqft) +
lu + overall_cond + own_occ + yr_built, model = "ls")
z2 <- mydat %>%
zelig(formula = I((worship/1000)^2) ~ pop_density_int +
pop_white + log(median_income) + neighborhood +
log(train_dist) + log(bus_dist) + log(cost_sqft) +
lu + overall_cond + own_occ + yr_built, model = "ls")
z3 <- mydat %>%
zelig(formula = I((social/1000)^2) ~ pop_density_int +
pop_white + log(median_income) + neighborhood +
log(train_dist) + log(bus_dist) + log(cost_sqft) +
lu + overall_cond + own_occ + yr_built, model = "ls")
z4 <- mydat %>%
zelig(formula = I((parks/1000)^2) ~ pop_density_int +
pop_white + log(median_income) + neighborhood +
log(train_dist) + log(bus_dist) + log(cost_sqft) +
lu + overall_cond + own_occ + yr_built, model = "ls")
# Here's what the quartiles look like
mystat <- read_rds("building_dist_dataset.rds") %>%
filter(neighborhood != "Other") %>%
group_by(breaks = ntile(pop_white, 4)) %>%
summarize(lower = min(pop_white, na.rm = TRUE),
median = median(pop_white, na.rm = TRUE),
upper = max(pop_white, na.rm = TRUE)) %>%
ungroup() %>%
mutate(label = paste(round(lower*100, 0), "-", round(upper*100, 0), "%", sep = ""))
# Get rid of unnecessary data
remove(mydat)
bind_rows(
z1 %>%
setx(pop_white = 1:4) %>% sim() %>%
zelig_qi_to_df() %>%
select(x = pop_white, ev = expected_value) %>%
mutate(type = "community"),
z2 %>%
setx(pop_white = 1:4) %>% sim() %>%
zelig_qi_to_df() %>%
select(x = pop_white, ev = expected_value) %>%
mutate(type = "worship"),
z3 %>%
setx(pop_white = 1:4) %>% sim() %>%
zelig_qi_to_df() %>%
select(x = pop_white, ev = expected_value) %>%
mutate(type = "social"),
z4 %>%
setx(pop_white = 1:4) %>% sim() %>%
zelig_qi_to_df() %>%
select(x = pop_white, ev = expected_value) %>%
mutate(type = "parks")) %>%
mutate(type = type %>% recode_factor(
"community" = "Community Spaces",
"worship" = "Places of Worship",
"social" = "Social Businesses",
"parks" = "Parks")) %>%
group_by(type, x) %>%
summarize(lower_ci = quantile(ev, probs = 0.025),
estimate = quantile(ev, probs = 0.50),
upper_ci = quantile(ev, probs = 0.975)) %>%
# Get better positions
left_join(by = c("x" = "breaks"), y = mystat) %>%
# Transform the result back into meters, not kilometers squared
mutate_at(vars(estimate, lower_ci, upper_ci), list(~sqrt(.)*1000)) %>%
ungroup() %>%
write_rds("raw_data/mysim.rds")
# Next, let's gather first differences,
# Calculating the difference in median distance squared
# for a building where pop_white = 4 vs. pop_white = 1
get_fd = function(mysimulation){
data.frame(fd = mysimulation$sim.out$x1$fd %>% unlist(),
x1 = mysimulation$sim.out$x1$ev %>% unlist(),
x = mysimulation$sim.out$x$ev %>% unlist()) %>%
return()
}
# Let's write a quick function to extract first differences
bind_rows(
z1 %>%
sim(., x = setx(., pop_white = 1), x1 = setx1(., pop_white = 4)) %>%
get_fd() %>%
mutate(type = "community"),
z2 %>%
sim(., x = setx(., pop_white = 1), x1 = setx1(., pop_white = 4)) %>%
get_fd() %>%
mutate(type = "worship"),
z3 %>%
sim(., x = setx(., pop_white = 1), x1 = setx1(., pop_white = 4)) %>%
get_fd() %>%
mutate(type = "social"),
z4 %>%
sim(., x = setx(., pop_white = 1), x1 = setx1(., pop_white = 4)) %>%
get_fd() %>%
mutate(type = "parks")) %>%
mutate(type = type %>% recode_factor(
"community" = "Community Spaces",
"worship" = "Places of Worship",
"social" = "Social Businesses",
"parks" = "Parks")) %>%
# Back transform the simulated outcomes
mutate_at(vars(x, x1), list(~sqrt(.)*1000)) %>%
# Recalculate the first differences
mutate(fd = x1 - x) %>%
group_by(type) %>%
summarize(
estimate = quantile(fd, probs = 0.50),
lower_ci = quantile(fd, probs = 0.025),
upper_ci = quantile(fd, probs = 0.975),
# Actual simulated SE
sd = sd(fd, na.rm = TRUE),
# Approximated from confidence interval,
# assuming normal distribution
se = (upper_ci - lower_ci) / (2*1.96),
# se and sd are very close, so let's use sd, which is more accurate
z = estimate / sd,
p = exp(-0.717*z - 0.416*z^2),
stars = gtools::stars.pval(p),
estimate_label = round(estimate, 1),
estimate_label = if_else(estimate > 0,
true = paste("+", estimate_label, sep = ""),
false = paste(estimate_label)),
diff_label = paste("Change: ", estimate_label, stars, sep = "")) %>%
write_rds("raw_data/myfd.rds")
rm(z1,z2,z3,z4, mydat)library(viridis)
mysim <- read_rds("raw_data/mysim.rds")
myfd <- read_rds("raw_data/myfd.rds")
ggplot() +
geom_crossbar(data = mysim,
mapping = aes(x = median * 100, y = estimate,
ymin = lower_ci, ymax = upper_ci,
color = type, fill = type),
color = "#373737", alpha = 0.5) +
geom_text(data = mysim, mapping = aes(x = median * 100, y = estimate,
label = round(estimate, 0)),
nudge_y = 15) +
geom_text(data = myfd,
mapping = aes(x = 50, y = 760, label = diff_label), color = "#373737") +
facet_wrap(~type, ncol = 2) +
guides(fill = "none", color = "none") +
theme_classic(base_size = 14) +
labs(x = "% White Residents in surrounding 1 sq.km. cell",
y = "Median Distance (m) to Nearby Social Infrastructure\n(within 1 km buffer)",
caption = "Numbers show expected median distance (m), with 95% simulated confidence intervals.\nEstimates placed on x axis according to quartile for share of White residents,\nlocated at quartile midpoints.\nQ1 (First quartile) = 6~35% White. Q2 = 35-29%. Q3 = 59-72%. Q4 = 72-93%.",
subtitle = "Race vs. Access to Social Infrastructure") +
scale_x_continuous(breaks = c(0, 25, 50, 75, 100), limits = c(0, 100),
expand = expansion(c(0,0))) +
scale_fill_viridis(discrete = TRUE, option = "plasma", begin = 0, end = 0.6) +
scale_color_viridis(discrete = TRUE, option = "plasma", begin = 0, end = 0.6) +
theme(panel.border = element_rect(fill = NA, color = "#373737"),
panel.spacing.x = unit(0.75, "cm"),
axis.line = element_blank(),
axis.ticks = element_blank(),
strip.background = element_blank(),
plot.subtitle = element_text(hjust = 0.5),
plot.caption = element_text(hjust = 0),
plot.caption.position = "plot")library(tidyverse)
mytypes <- read_rds("raw_data/buffer_dist.rds") %>%
# There are 275 builings in Boston that are unlabelled. We will just remove them.
filter(nchar(building_id) > 1) %>%
# Collect the following variables, telling us...
# This building in THAT CELL has HOW MANY social infrastructure sites of TYPE X within THAT RADIUS?
select(cell_id = buffer_id, building_id, type, contains("count"))What percentage of social infrastructure sites near that building are of type X?
Need total number of social infrastructure sites near that building
grid <- read_rds("raw_data/grid.rds") %>%
as_tibble() %>%
select(-geometry)
# Per building, how many sites are you proximate too?
mydat <- mytypes %>%
# In each cell,
# on average, how many social businesses are you within 1000 km from?,
# given the total number of buildings in that cell
group_by(cell_id) %>%
summarize(
community = mean(count1000[type == "Community Spaces"], na.rm = TRUE),
social = mean(count1000[type == "Social Businesses"], na.rm = TRUE),
park = mean(count1000[type == "Parks"], na.rm = TRUE),
worship = mean(count1000[type == "Places of Worship"], na.rm = TRUE)) %>%
left_join(by = "cell_id", y = grid) %>%
# Impute second smallest value
mutate_at(vars(community, social, park, worship),
list(~if_else(. == 0, sort(unique(.))[2]/2, .))) %>%
mutate_at(vars(community, social, park, worship),
list(~log(.)))
# What kinds of places end up with closer access to social infrastructure?
m1 <- mydat %>%
lm(formula = community ~ pop_density_int + pop_black + pop_hisplat + pop_asian +
pop_some_college + median_income + income_inequality +
median_monthly_housing_cost)
m2 <- mydat %>%
lm(formula = worship ~ pop_density_int + pop_black + pop_hisplat + pop_asian +
pop_some_college + median_income + income_inequality +
median_monthly_housing_cost)
m3 <- mydat %>%
lm(formula = social ~ pop_density_int + pop_black + pop_hisplat + pop_asian +
pop_some_college + median_income + income_inequality +
median_monthly_housing_cost)
m4 <- mydat %>%
lm(formula = park ~ pop_density_int + pop_black + pop_hisplat + pop_asian +
pop_some_college + median_income + income_inequality +
median_monthly_housing_cost)
texreg::screenreg(list(m1,m2,m3,m4))##
## ======================================================================
## Model 1 Model 2 Model 3 Model 4
## ----------------------------------------------------------------------
## (Intercept) -0.48 0.92 1.55 -3.41
## (3.10) (2.08) (2.41) (2.49)
## pop_density_int 0.00 0.00 *** 0.00 *** 0.00
## (0.00) (0.00) (0.00) (0.00)
## pop_black -1.73 0.34 -1.81 -0.81
## (1.38) (0.94) (1.07) (1.12)
## pop_hisplat -0.56 -2.47 -2.71 2.24
## (1.87) (1.29) (1.46) (1.52)
## pop_asian -4.08 -1.41 1.45 8.76 *
## (4.31) (3.00) (3.45) (3.61)
## pop_some_college 2.68 6.70 12.55 * 2.52
## (6.35) (4.37) (4.95) (5.19)
## median_income -0.00 0.00 -0.00 0.00
## (0.00) (0.00) (0.00) (0.00)
## income_inequality -1.19 -4.02 -12.73 *** 0.92
## (4.47) (3.02) (3.49) (3.61)
## median_monthly_housing_cost 0.00 -0.00 0.00 ** 0.00
## (0.00) (0.00) (0.00) (0.00)
## ----------------------------------------------------------------------
## R^2 0.06 0.20 0.19 0.18
## Adj. R^2 0.00 0.16 0.15 0.14
## Num. obs. 155 163 163 165
## ======================================================================
## *** p < 0.001; ** p < 0.01; * p < 0.05
read_rds("raw_data/buffer_dist.rds") %>%
# There are 275 builings in Boston that are unlabelled. We will just remove them.
filter(nchar(building_id) > 1) %>%
# Collect the following variables, telling us...
# This building in THAT CELL was a median of this far away from social infrastructure sites of TYPE X within THAT RADIUS?
select(cell_id = buffer_id, building_id, type, dist = dist1000) %>%
ggplot(mapping = aes(x = type, y = dist)) +
geom_boxplot()library(tidyverse)
library(sf)
mytest <- mytypes %>%
select(cell_id, building_id, type, count = count1000) %>%
pivot_wider(id_cols = c(cell_id, building_id), names_from = type, values_from = count) %>%
# Filter to just buildings that were within that threshold of ANY social infrastructure
filter(`Community Spaces` != 0 |
`Social Businesses` != 0 |
`Places of Worship` != 0 |
`Parks` != 0) %>%
mutate(total = `Community Spaces` + `Social Businesses` + `Places of Worship` + `Parks`) %>%
mutate_at(vars(`Community Spaces`:`Social Businesses`), list(~./total))