2nd Mini-project Answers

library(tidyverse)
library(sf)          
library(raster)
library(tmap)

library(tidycensus)
library(tigris)
library(lehdr)

library(httr) 
library(jsonlite)
library(viridis)
library(ggpubr)

# insert your Google Places API key to your environment 
census_api_key("YOUR API GOES HERE", install = TRUE, overwrite = TRUE)
readRenviron("~/.Renviron") # or, Restart R

# specify the folder to save tiger/line shapefiles 
tigris_cache_dir(paste0(getwd(), "/tigris"))
readRenviron('~/.Renviron') # or, Restart R

1. Atlanta BeltLine and Changes in Local Economies

Step 1. Import an official BeltLine shapefile as an sf object: line

getwd() # put the shapefile in your working directory 
beltline <- st_read("Export_Output.shp")
class(beltline)

east_trail <- beltline %>%
  filter(Status == "Completed", Name == "Eastside Trail")

Step 4. Read in the 2006-2010 & 2013-2017 ACS 5 year estimates at the tract level in Fulton County

tract_fulton <- tracts(state = "13", county = "121", cb = TRUE) %>% st_as_sf %>% dplyr::select(GEOID)

ses2010 <- get_acs(state = "13", county = "Fulton", geography = "tract", year = 2010, 
                   variables = c("B01001_001", "B01001H_001", 
                                 "B15002_001", "B15002_015", "B15002_016", "B15002_017", "B15002_018", 
                                 "B15002_032", "B15002_033", "B15002_034", "B15002_035", 
                                 "B19013_001"), output = "wide", geometry = FALSE) 
ses2010b <- ses2010 %>%
  mutate(pct_nhw = B01001H_001E/B01001_001E*100, 
         pct_coll = (B15002_015E + B15002_016E + B15002_017E + B15002_018E + 
                     B15002_032E + B15002_033E + B15002_034E + B15002_035E)/B15002_001E*100, 
         ln_medinc = log(B19013_001E)
         ) %>%
  mutate(pct_nhw2010 = scale(pct_nhw)[, 1], 
         pct_coll2010 = scale(pct_coll)[, 1], 
         ln_medinc2010 = scale(ln_medinc)[, 1], 
         ses2010 = (pct_nhw2010 + pct_coll2010 + ln_medinc2010)/3) %>%
  dplyr::select(GEOID, pct_nhw2010, pct_coll2010, ln_medinc2010, ses2010) 

ses2017 <- get_acs(state = "13", county = "Fulton", geography = "tract", year = 2017, 
                   variables = c("B01001_001", "B01001H_001", 
                                 "B15002_001", "B15002_015", "B15002_016", "B15002_017", "B15002_018", 
                                 "B15002_032", "B15002_033", "B15002_034", "B15002_035", 
                                 "B19013_001"), output = "wide", geometry = FALSE)
ses2017b <- ses2017 %>%
  mutate(pct_nhw = B01001H_001E/B01001_001E*100, 
         pct_coll = (B15002_015E + B15002_016E + B15002_017E + B15002_018E + 
                     B15002_032E + B15002_033E + B15002_034E + B15002_035E)/B15002_001E*100, 
         ln_medinc = log(B19013_001E)
         ) %>%
  mutate(pct_nhw2017 = scale(pct_nhw)[, 1], 
         pct_coll2017 = scale(pct_coll)[, 1], 
         ln_medinc2017 = scale(ln_medinc)[, 1], 
         ses2017 = (pct_nhw2017 + pct_coll2017 + ln_medinc2017)/3) %>%
  dplyr::select(GEOID, pct_nhw2017, pct_coll2017, ln_medinc2017, ses2017) 

tract_ses <- tract_fulton %>%
  left_join(ses2010b, by = "GEOID") %>%
  left_join(ses2017b, by = "GEOID") %>%
  dplyr::select(GEOID, ses2010, ses2017)

Step 7. Compare patterns of the SES index between 2006-2010 and 2013-2017

st_crs(east_trail)
st_crs(tract_ses)

tract_ses_P <- st_transform(tract_ses, 3857)
st_crs(tract_ses_P)

distances_tr <- st_distance(tract_ses_P, east_trail)
tract_ses_P$dist <- distances_tr[, 1]

ggplot(tract_ses_P) + 
  geom_point(aes(x = as.numeric(dist), y = ses2010), color = "grey") +
  geom_smooth(aes(x = as.numeric(dist), y = ses2010), color = "grey", se = FALSE) +
  geom_point(aes(x = as.numeric(dist), y = ses2017)) +
  geom_smooth(aes(x = as.numeric(dist), y = ses2017), se = FALSE)

ggplot(tract_ses_P)+
  geom_sf(aes(fill = as.numeric(dist)), colour = "white")

2. Transit Catchment Areas in the United States

Step 1. Download tod_database_download.csv on https://toddata.cnt.org/downloads.php.

stations_df <- read_csv("tod_database_download.csv") 
stations_sf <- st_as_sf(stations_df, coords = c("Longitude", "Latitude"), crs = 4326)

Step 7 Compute the same summary statistics for the entire country with the same data sets.

Create half-mile buffers for individual stations.

stations_sf <- stations_sf %>%
  filter(Buffer == "Existing Transit") %>%
  st_transform(2163) # projection that preserves areas for the contiguous US 

bf_all <- st_buffer(stations_sf, dist = 800)

Create a list of counties that overlap any buffer.

counties <- counties() %>% st_as_sf() %>% st_transform(2163) # all US counties - 3233 
counties_tr <- counties[bf_all, ] # US counties overlapped by any transit station buffer - 191 
counties_tr_df <- counties_tr %>%
  mutate(FIPS = paste0(STATEFP, COUNTYFP)) %>%
  arrange(FIPS) %>%
  dplyr::select(FIPS)
# nrow(counties_tr_df) == n_distinct(counties_tr_df) # check any duplicate 
counties_tr_list <- counties_tr_df$FIPS # create a vector

Define a function that extracts # of commuters using public transit, biking, and waking at the block group level.

get_wrk_cnt <- function(x){
  tryCatch(
    {get_acs(state = substr(x, 1, 2), county = substr(x, 3, 5), 
                 geography = "block group", year = 2017, 
                 variables = c("B08301_001", "B08301_010", "B08301_018", "B08301_019"), 
                 output = "wide", geometry = TRUE)
    }, 
    error = function(cond){
     message("Here's the original error message:")
      message(cond)
      return(NA)
    }
  )
}

Apply the function get_wrk_cnt only to those counties that overlap any buffer.

# a <- Sys.time() # start time 
output <- map(counties_tr_list, get_wrk_cnt) 
# b <- Sys.time() # end time 
# b - a           # processing time: 7.708619 mins

Check if there is any API error. If necessary, apply the function to those counties with the error.

## check the output & save for later use 
map_lgl(output, function(x) class(x)[1] %in% c("sf", "data.frame"))
map_lgl(output, ~class(.)[1] %in% c("sf", "data.frame")) # this does the same job as the above line. 

object.size(output) # about 120 mb
map_dbl(output, nrow) %>% sum() # total number of blkgroups

For later use, save the outcome.

write_rds(output, "wrk_cnt_blkgrp2017.rds") # save it as an R data set 
output <- read_rds("wrk_cnt_blkgrp2017.rds")    # read in as an R data set

Row-bind all list elements. Each list element of output is an sf object containing all block groups for each county.

## combining lists of sf objects to single dataframe: https://github.com/r-spatial/sf/issues/798
## FYI, do.call https://www.rdocumentation.org/packages/base/versions/3.5.3/topics/do.call
# a <- Sys.time() # start time
output_bound <- do.call(what = sf:::rbind.sf, args = output) %>% 
  st_transform(crs = 2163)
# b <- Sys.time() # end time
# b - a           # processing time: 37.85005 secs

object.size(output_bound) # about 120 mb, a little smaller than ouput 
plot(st_geometry(output_bound)) # this takes time to process

For later use, save the outcome.

write_rds(output_bound, "wrk_cnt_blkgrp2017_bound.rds") # save it as an R data set 
output_bound <- read_rds("wrk_cnt_blkgrp2017_bound.rds")    # read in as an R data set

Spatially subset blockgroups that overlap any buffer.

blkgrp_all <- output_bound[bf_all, ]
blkgrp_all$area_whole <- blkgrp_all %>% st_area() %>% unclass()

Create segments of blockgroups that fall into each buffer.

seg_all <- st_intersection(bf_all, blkgrp_all)
seg_all$area_seg <- seg_all %>% st_area() %>% unclass()
seg_all$area_prop <- seg_all$area_seg / seg_all$area_whole

Union those segements for each buffer.
Summarize commute mode information for each station.

union_all <- 
  seg_all %>% 
  group_by(Station.Name) %>% 
  mutate(workers = B08301_001E * area_prop, 
         workers_tr = B08301_010E * area_prop, 
         workers_bk = B08301_018E * area_prop,
         workers_wk = B08301_019E * area_prop
         ) %>%
  summarize(n_seg = n(), 
            workers_sum = sum(workers), 
            workers_sum_tr = sum(workers_tr),
            workers_sum_bk = sum(workers_bk),
            workers_sum_wk = sum(workers_wk), 
            workers_pct_tr = workers_sum_tr/workers_sum*100, 
            workers_pct_active = (workers_sum_bk + workers_sum_bk)/workers_sum*100) %>%
  arrange(desc(n_seg))

3. Measuring Consumption Amenities

Step 1 Create xy points for API calls.

Download the Urban Area bounary and convert to sf to use tidyverse verbs

ua00 <- urban_areas(cb = FALSE, year = 2017) # https://cran.r-project.org/web/packages/tigris/tigris.pdf
ua01 <- ua00 %>% st_as_sf()

Subset Boston UA & San Francisco UA.

boston_ua <- ua01 %>%
  filter(grepl("Boston, MA", NAME10)) %>%
  st_transform(crs = 2163) # projection that preserves areas for the contiguous US 

sanf_ua <- ua01 %>%
  filter(grepl("San Francisco", NAME10)) %>%
  st_transform(crs = 2163) # projection that preserves areas for the contiguous US

Convert two UAs to raster objects with resolution of 800 meters

boston_raster_template  <- 
  raster(extent(boston_ua), 
         resolution = 800, 
         crs = st_crs(boston_ua)$proj4string) # http://strimas.com/spatial/hexagonal-grids/
boston_r <- 
  rasterize(boston_ua, boston_raster_template, field = 1)

sanf_raster_template  <- 
  raster(extent(sanf_ua), 
         resolution = 800, 
         crs = st_crs(sanf_ua)$proj4string) # http://strimas.com/spatial/hexagonal-grids/
sanf_r <- 
  rasterize(sanf_ua, sanf_raster_template, field = 1)

Convert two raster objects to polygons (i.e., fishnet)

boston_fishnet <- rasterToPolygons(boston_r) %>%
  st_as_sf()  

sanf_fishnet <- rasterToPolygons(sanf_r) %>%
  st_as_sf()

Convert the polygons to their centroids.

boston_fishnet_cent <- st_centroid(boston_fishnet)
sanf_fishnet_cent   <- st_centroid(sanf_fishnet)

Identify the CBD in Boston and San Francisco.
For Boston, extract the xy coordinates of the census tract with the hightest job density

# number of jobs for individual tracts in MA 
ma_job <- grab_lodes(state = "MA", year = 2015, lodes_type = "wac",
                     job_type = "JT00", segment = "S000", agg_geo = "tract",
                     download_dir = file.path(getwd(), "lodes_raw")) # specify the folder to download

# tracts in Boston UA 
ma_tr <- tracts(state = "MA", cb = TRUE) 
ma_tr_sf <- ma_tr %>% st_as_sf() %>% st_transform(crs=2163)
boston_tr_sf <- ma_tr_sf[boston_ua, ]

# calculate job density for individual tracts 
# extract the xy coordinates of the tract with the hightest density 
boston_tr_sf %>%
  left_join(ma_job, by = c("GEOID" = "w_tract")) %>%
  mutate(ALAND2 = as.numeric(ALAND),
         jobden = C000/ALAND2, 0) %>%
  arrange(desc(jobden)) %>% 
  top_n(jobden, n=1) %>%
  dplyr::select(GEOID) %>%
  st_centroid()

For San Francisco, extract the xy coordinates of the census tract with the hightest job density

# number of jobs for individual tracts in CA 
ca_job <- grab_lodes(state = "CA", year = 2015, lodes_type = "wac",
                     job_type = "JT00", segment = "S000", agg_geo = "tract",
                     download_dir = file.path(getwd(), "lodes_raw")) # specify the folder to download

# tracts in San Francisco UA 
ca_tr <- tracts(state = "CA", cb = TRUE) 
ca_tr_sf <- ca_tr %>% st_as_sf() %>% st_transform(crs=2163)
sanf_tr_sf <- ca_tr_sf[sanf_ua, ]

# calculate job density for individual tracts 
# extract the xy coordinates of the tract with the hightest density 
sanf_tr_sf %>%
  left_join(ca_job, by = c("GEOID" = "w_tract")) %>%
  mutate(ALAND2 = as.numeric(ALAND),
         jobden = C000/ALAND2, 0) %>%
  arrange(desc(jobden)) %>% 
  top_n(jobden, n=1) %>%
  dplyr::select(GEOID) %>%
  st_centroid()

Subset those fishnet points that are within five miles from the CBD in Boston and San Francisco.

# For Boston 
boston_cbd_point    <- c(2317225, 124512.1) %>% st_point()   
boston_cbd          <- st_sfc(boston_cbd_point, crs = 2163)   
boston_cbd_bf       <- boston_cbd %>% st_buffer(dist = 8046.72) # create a 5-mile buffer  
boston_pts          <- boston_fishnet_cent[boston_cbd_bf, ]

tmap_mode("view")
tm_basemap("OpenStreetMap.Mapnik") +
  tm_shape(boston_pts) +
  tm_dots()

# For San Francisco
sanf_cbd_point    <- c(-1943352, -537613.2) %>% st_point()   
sanf_cbd          <- st_sfc(sanf_cbd_point, crs = 2163)   
sanf_cbd_bf       <- sanf_cbd %>% st_buffer(dist = 8046.72) # create a 5-mile buffer  
sanf_pts          <- sanf_fishnet_cent[sanf_cbd_bf, ]

tmap_mode("view")
tm_basemap("OpenStreetMap.Mapnik") +
tm_shape(sanf_pts) +
  tm_dots()

Extract xy coordinates in Google API’s epsg
Reproject with crs=4326 and extract xy coordinates

boston_pts_wgs84 <- 
  boston_pts %>% 
  st_transform(crs = 4326) %>% 
  st_coordinates() %>% 
  as.data.frame()

sanf_pts_wgs84 <- 
  sanf_pts %>% 
  st_transform(crs = 4326) %>% 
  st_coordinates() %>% 
  as.data.frame()

Step 2 Send Google Places API calls.

Prepare URLs for API calls

boston_lat_lng <- paste(boston_pts_wgs84$Y, boston_pts_wgs84$X, sep=",")
sanf_lat_lng <- paste(sanf_pts_wgs84$Y, sanf_pts_wgs84$X, sep=",")

url_1 <- "https://maps.googleapis.com/maps/api/place/nearbysearch/json?location="
url_2 <- "&rankby=distance&type="
business <- "cafe"
my_api <- "YOUR API KEY GOES HERE"
token <- "&pagetoken="

Define a function

google_places <- function(lat_lng){
  
  # read in output from Google 
  result1_raw <<-
    paste0(url_1, lat_lng, url_2, business, "&key=", my_api) %>% 
    GET() %>% 
    content(as = "text") %>%
    fromJSON() 
  # transform to a data frame 
  if (result1_raw$status == "OK"){
    result1_df <<- 
      result1_raw$results %>% 
      dplyr::select(name, place_id, types, rating, user_ratings_total) 
    result1_df$x <<- result1_raw$results$geometry$location$lng
    result1_df$y <<- result1_raw$results$geometry$location$lat
    # result1_df$geometry <- NULL
  } else {
    result1_df <<- NA
  }
  print("Result #1 is just finished.")
  Sys.sleep(2)
  
  # run if results have more than 20 businesses 
  if (is.null(result1_raw$next_page_token) == FALSE){
    # read in output from Google 
    result2_raw <<-
      paste0(url_1, lat_lng, url_2, business, "&key=", my_api, token, result1_raw$next_page_token) %>% 
      GET() %>% 
      content(as = "text") %>%
      fromJSON() 
    # transform to a data frame 
    if (result2_raw$status == "OK"){
      result2_df <<- 
        result2_raw$results %>% 
        dplyr::select(name, place_id, types, rating, user_ratings_total) 
      result2_df$x <<- result2_raw$results$geometry$location$lng
      result2_df$y <<- result2_raw$results$geometry$location$lat
      # result2_df$geometry <- NULL
    } else {
      result2_df <<- NA
    }
  } else {
    result2_df <<- NA
  }
  print("Result #2 is just finished.")
  Sys.sleep(2)
  
  # run if results have more than 40 businesses 
  if (is.null(result2_raw$next_page_token) == FALSE){
    # read in output from Google 
    result3_raw <<-
      paste0(url_1, lat_lng, url_2, business, "&key=", my_api, token, result2_raw$next_page_token) %>% 
      GET() %>% 
      content(as = "text") %>%
      fromJSON() 
    # transform to a data frame 
    if (result3_raw$status == "OK"){
      result3_df <<- 
        result3_raw$results %>% 
        dplyr::select(name, place_id, types, rating, user_ratings_total) 
      result3_df$x <<- result3_raw$results$geometry$location$lng
      result3_df$y <<- result3_raw$results$geometry$location$lat
      # result3_df$geometry <- NULL
    } else {
      result3_df <<- NA
    }
  } else {
    result3_df <<- NA
  }
  print("Result #3 is just finished.")
  
  result_invalid <<- list(result1_raw$status, result2_raw$status, result3_raw$status) %>%
    map_lgl(~.=="INVALID_REQUEST")
  if (any(result_invalid)){
    print("INVALID_REQUEST")
    return("INVALID_REQUEST")
  } else {
    result_list  <<- list(result1_df, result2_df, result3_df)
    result_final <<- result_list[!is.na(result_list)] %>% 
      dplyr::bind_rows()
    print(paste0("Succesful collection for ", as.character(i), "th point"))
    return(result_final)
  }
  Sys.sleep(2)
}

Make an empty ouput list
Run a for loop
Bind rows and delete duplicates
Convert a df object to an sf object
Map the cafes within 5 miles from the CBD (in fact, cafes outside of the buffers are also scrapped.)

boston_output <- vector("list", length(boston_lat_lng))

for (i in seq_along(boston_lat_lng)) {
  boston_output[[i]] <- google_places(boston_lat_lng[[i]])  
}

map(boston_output, ~class(.)!="data.frame") %>% unlist() %>% sum()
map_dbl(boston_output, nrow) %>% unlist() %>% sum()/60

boston_output_df <- dplyr::bind_rows(boston_output) %>% as_tibble()

boston_output_df2 <- 
  boston_output_df %>%  
  group_by(place_id) %>%
  summarize(rating = mean(rating, na.rm = TRUE), 
            rating_n = mean(user_ratings_total, na.rm = TRUE), 
            x = mean(x), 
            y = mean(y)) 
  
boston_output_sf <- st_as_sf(boston_output_df2, coords = c("x", "y"), crs = 4326)
boston_output_sf$ln_rating_n <- log(boston_output_sf$rating_n +1)

tmap_mode("view")
tm_basemap("Stamen.TonerLite", alpha = 0.25) +
  tm_shape(boston_output_sf) +
  tm_dots(col = "ln_rating_n", size = 0.05, palette = "-viridis") +
  tm_shape(boston_pts) +
  tm_dots(col = "red", size = 0.01, alpha = 0.25) + 
  tm_shape(boston_cbd_bf) +
  tm_borders(col = "red") + 
  tm_tiles("Stamen.TonerLabels")

sanf_output <- vector("list", length(sanf_lat_lng))

for (i in seq_along(sanf_lat_lng)) {
  sanf_output[[i]] <- google_places(sanf_lat_lng[[i]])  
}

map(sanf_output, ~class(.)!="data.frame") %>% unlist() %>% sum()
map_dbl(sanf_output, nrow) %>% unlist() %>% sum()/60

sanf_output_df <- dplyr::bind_rows(sanf_output) %>% as_tibble()

sanf_output_df2 <- 
  sanf_output_df %>%  
  group_by(place_id) %>%
  summarize(rating = mean(rating, na.rm = TRUE), 
            rating_n = mean(user_ratings_total, na.rm = TRUE), 
            x = mean(x), 
            y = mean(y)) 
  
sanf_output_sf <- st_as_sf(sanf_output_df2, coords = c("x", "y"), crs = 4326)
sanf_output_sf$ln_rating_n <- log(sanf_output_sf$rating_n +1)

tmap_mode("view")
tm_basemap("Stamen.TonerLite", alpha = 0.25) +
  tm_shape(sanf_output_sf) +
  tm_dots(col = "ln_rating_n", size = 0.05, palette = "-viridis") +
  tm_shape(sanf_pts) +
  tm_dots(col = "red", size = 0.01, alpha = 0.25) + 
  tm_shape(sanf_cbd_bf) +
  tm_borders(col = "red") + 
  tm_tiles("Stamen.TonerLabels")

Step 3 Compute cafe density for individual tracts

boston_den_cafe <- boston_output_sf %>%
  st_transform(crs = 2163) %>%
  st_join(boston_tr_sf[boston_cbd_bf, ]) %>% # Boston tracts within five miles from the CBD 
  filter(is.na(GEOID) == FALSE) %>% # drop census tracts with no cafe
  mutate(
    sqmt = as.numeric(ALAND), # ALAND is characer
    sqmi = sqmt/1000000*0.386102159  # convert from sq meter to sq mile 
  ) %>% 
  group_by(GEOID) %>%
  summarize(
    n = n(), 
    sqmi_m = mean(sqmi), 
    den_cafe = n/sqmi_m
    ) %>%
  dplyr::select(GEOID, den_cafe)

sanf_den_cafe <- sanf_output_sf %>%
  st_transform(crs = 2163) %>%
  st_join(sanf_tr_sf[sanf_cbd_bf, ]) %>% # Boston tracts within five miles from the CBD 
  filter(is.na(GEOID) == FALSE) %>% # drop census tracts with no cafe
  mutate(
    sqmt = as.numeric(ALAND), # ALAND is characer
    sqmi = sqmt/1000000*0.386102159  # convert from sq meter to sq mile 
  ) %>% 
  group_by(GEOID) %>%
  summarize(
    n = n(), 
    sqmi_m = mean(sqmi), 
    den_cafe = n/sqmi_m
    ) %>%
  dplyr::select(GEOID, den_cafe)

Step 4 Collect an alternative measure

# select only retail jobs 
ma_retail <- ma_job %>%
  dplyr::select(GEOID = w_tract, retail = CNS07)

# compute retail density for individual tracts 
boston_den_retail_sf <- 
  boston_tr_sf[boston_cbd_bf, ] %>% 
  left_join(ma_retail) %>%
  mutate(
    sqmt = as.numeric(ALAND),
    sqmi = sqmt/1000000*0.386102159,  # convert from sq meter to sq mile 
    den_retail = retail/sqmi
  ) %>%
  dplyr::select(GEOID, den_retail)

tmap_mode("view")
tm_shape(boston_cbd_bf) + # the first two lines do not add any line, but set the geographic extend 
  tm_borders(lwd = 0) + 
  tm_shape(boston_den_retail_sf) + 
  tm_polygons(border.col = "white", lwd = 1, 
              col = "den_retail", style = "quantile", n = 10, alpha = 0.5) +
  tm_shape(boston_cbd_bf) + 
  tm_borders(col = "black", alpha = 0.5)

# select only retail jobs 
ca_retail <- ca_job %>%
  dplyr::select(GEOID = w_tract, retail = CNS07)

# compute retail density for individual tracts 
sanf_den_retail_sf <- 
  sanf_tr_sf[sanf_cbd_bf, ] %>% 
  left_join(ca_retail) %>%
  mutate(
    sqmt = as.numeric(ALAND),
    sqmi = sqmt/1000000*0.386102159,  # convert from sq meter to sq mile 
    den_retail = retail/sqmi
  ) %>%
  dplyr::select(GEOID, den_retail)

tmap_mode("view")
tm_shape(sanf_cbd_bf) + # the first two lines do not add any line, but set the geographic extend 
  tm_borders(lwd = 0) + 
  tm_shape(sanf_den_retail_sf) + 
  tm_polygons(border.col = "white", lwd = 1, 
              col = "den_retail", style = "quantile", n = 10, alpha = 0.5) +
  tm_shape(sanf_cbd_bf) + 
  tm_borders(col = "black", alpha = 0.5)

Step 5 Compute college graduates 25-34 as of 2017

For Boston

# var_acs17 <- load_variables(2017, "acs5", cache = TRUE)
# View(var_acs17)
# B15001_017: Male!!25 to 34 years!!Bachelor's degree
# B15001_018: Male!!25 to 34 years!!Graduate or professional degree
# B15001_058: Female!!25 to 34 years!!Bachelor's degree
# B15001_059: Female!!25 to 34 years!!Graduate or professional degree

ma_young <- get_acs(state = "MA", geography = "tract", year = 2017,
                   variables = c("B15001_017", "B15001_018", "B15001_058", "B15001_059"), 
                   output = "wide") 

ma_young2 <- ma_young %>%
  mutate(young = B15001_017E + B15001_018E + B15001_058E + B15001_059E) %>%
  dplyr::select(GEOID, young)

boston_cbd_young <- boston_tr_sf[boston_cbd_bf, ] %>%
  left_join(ma_young2) %>%
  mutate(
    sum_young = sum(young), 
    pct_young = young/sum_young*100 
  ) %>%
  dplyr::select(GEOID, pct_young)

tmap_mode("view")
tm_basemap("Stamen.TonerLite", alpha = 0.25) +
  tm_shape(boston_cbd_bf) + # the first two lines do not add any line, but set the geographic extend 
  tm_borders(lwd = 0) + 
  tm_shape(boston_cbd_young) +
  tm_polygons(border.col = "white", lwd = 1, 
              col = "pct_young", style = "quantile", n = 5, alpha = 0.5) + 
  tm_shape(boston_output_sf) +
  tm_dots(col = "blue", size = 0.03, alpha = 0.25) + 
  tm_shape(boston_cbd_bf) + 
  tm_borders(col = "black", alpha = 0.25)

For San Francisco

ca_young <- get_acs(state = "CA", geography = "tract", year = 2017,
                   variables = c("B15001_017", "B15001_018", "B15001_058", "B15001_059"), 
                   output = "wide") 

ca_young2 <- ca_young %>%
  mutate(young = B15001_017E + B15001_018E + B15001_058E + B15001_059E) %>%
  dplyr::select(GEOID, young)

sanf_cbd_young <- sanf_tr_sf[sanf_cbd_bf, ] %>%
  left_join(ca_young2) %>%
  mutate(
    sum_young = sum(young), 
    pct_young = young/sum_young*100 
  ) %>%
  dplyr::select(GEOID, pct_young)

tmap_mode("view")
tm_basemap("Stamen.TonerLite", alpha = 0.25) +
  tm_shape(sanf_cbd_bf) + # the first two lines do not add any line, but set the geographic extend 
  tm_borders(lwd = 0) + 
  tm_shape(sanf_cbd_young) +
  tm_polygons(border.col = "white", lwd = 1, 
              col = "pct_young", style = "quantile", n = 10, alpha = 0.5) + 
  tm_shape(sanf_output_sf) +
  tm_dots(col = "blue", size = 0.03, alpha = 0.25) + 
  tm_shape(sanf_cbd_bf) + 
  tm_borders(col = "black", alpha = 0.25)

Step 6 Create plots to compare the performance of two measures

Convert sf objects to df objects

# Cafe density from Google Places API: the outcome from Step 3 
boston_den_cafe_df <- boston_den_cafe
st_geometry(boston_den_cafe_df) <- NULL 

sanf_den_cafe_df <- sanf_den_cafe
st_geometry(sanf_den_cafe_df) <- NULL 

# Retail job density from LEHD: the outcome from Step 4 
boston_den_retail_df <- boston_den_retail_sf 
st_geometry(boston_den_retail_df) <- NULL 

sanf_den_retail_df <- sanf_den_retail_sf 
st_geometry(sanf_den_retail_df) <- NULL 

# College graduate 25-34 from ACS 2017: the outcome from Step 5 
boston_cbd_young_df <- boston_cbd_young
st_geometry(boston_cbd_young_df) <- NULL 

sanf_cbd_young_df  <- sanf_cbd_young
st_geometry(sanf_cbd_young_df) <- NULL

Join three inputs
Process three key variables

data_boston <- left_join(boston_den_retail_df, boston_den_cafe_df) %>% 
  mutate(
    den_retail = ifelse(is.na(den_retail)== TRUE, 0, den_retail), 
    ln_retail = log(den_retail+1), 
    ln_retail = scale(ln_retail), 
    den_cafe = ifelse(is.na(den_cafe)== TRUE, 0, den_cafe), 
    ln_cafe = log(den_cafe+1), 
    ln_cafe = scale(ln_cafe)
  ) %>% 
  left_join(boston_cbd_young_df)

# https://stackoverflow.com/questions/7549694/adding-regression-line-equation-and-r2-on-graph
lm_eqn <- function(df, y, x){
    m <- lm(y ~ x, df);
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, 
         list(a = format(coef(m)[1], digits = 3),
              b = format(coef(m)[2], digits = 3),
             r2 = format(summary(m)$r.squared, digits = 3)))
    as.character(as.expression(eq));
}

a <- lm_eqn(data_boston, data_boston$pct_young, data_boston$ln_retail) 
b <- lm_eqn(data_boston, data_boston$pct_young, data_boston$ln_cafe)  

lm_retail_boston <- ggplot(data_boston) + 
  # stat_smooth_func(aes(ln_retail, pct_young), geom="text",method="lm",hjust=0,parse=TRUE) +
  geom_smooth(aes(ln_retail, pct_young), method="lm",se=FALSE, color = "coral1", size = 0.1) +
  geom_point(aes(ln_retail, pct_young), color = "coral4") + 
  coord_cartesian(xlim = c(-3, 3), ylim = c(0, 1.25)) + 
  geom_text(x = -1, y = 1.2, label = a, parse = TRUE)

lm_cafe_boston <- ggplot(data_boston) + 
  # stat_smooth_func(aes(ln_cafe, pct_young), geom="text",method="lm",hjust=0,parse=TRUE) +
  geom_smooth(aes(ln_cafe, pct_young), method="lm",se=FALSE, color = "deepskyblue1", size = 0.1) +
  geom_point(aes(ln_cafe, pct_young), color = "deepskyblue4") + 
  coord_cartesian(xlim = c(-3, 3), ylim = c(0, 1.25)) + 
  geom_text(x = -1, y = 1.2, label = b, parse = TRUE)

ggarrange(lm_retail_boston, lm_cafe_boston, 
          ncol = 2, nrow = 1)

data_sanf <- left_join(sanf_den_retail_df, sanf_den_cafe_df) %>% 
  mutate(
    den_retail = ifelse(is.na(den_retail)== TRUE, 0, den_retail), 
    ln_retail = log(den_retail+1), 
    ln_retail = scale(ln_retail), 
    den_cafe = ifelse(is.na(den_cafe)== TRUE, 0, den_cafe), 
    ln_cafe = log(den_cafe+1), 
    ln_cafe = scale(ln_cafe)
  ) %>% 
  left_join(sanf_cbd_young_df)

# https://stackoverflow.com/questions/7549694/adding-regression-line-equation-and-r2-on-graph
lm_eqn <- function(df, y, x){
    m <- lm(y ~ x, df);
    eq <- substitute(italic(y) == a + b %.% italic(x)*","~~italic(r)^2~"="~r2, 
         list(a = format(coef(m)[1], digits = 3),
              b = format(coef(m)[2], digits = 3),
             r2 = format(summary(m)$r.squared, digits = 3)))
    as.character(as.expression(eq));
}

a <- lm_eqn(data_sanf, data_sanf$pct_young, data_sanf$ln_retail) 
b <- lm_eqn(data_sanf, data_sanf$pct_young, data_sanf$ln_cafe)  

lm_retail_sanf <- ggplot(data_sanf) + 
  # stat_smooth_func(aes(ln_retail, pct_young), geom="text",method="lm",hjust=0,parse=TRUE) +
  geom_smooth(aes(ln_retail, pct_young), method="lm",se=FALSE, color = "coral1", size = 0.1) +
  geom_point(aes(ln_retail, pct_young), color = "coral4") + 
  coord_cartesian(xlim = c(-3, 3), ylim = c(0, 1.25)) + 
  geom_text(x = -1, y = 1.2, label = a, parse = TRUE)

lm_cafe_sanf <- ggplot(data_sanf) + 
  # stat_smooth_func(aes(ln_cafe, pct_young), geom="text",method="lm",hjust=0,parse=TRUE) +
  geom_smooth(aes(ln_cafe, pct_young), method="lm",se=FALSE, color = "deepskyblue1", size = 0.1) +
  geom_point(aes(ln_cafe, pct_young), color = "deepskyblue4") + 
  coord_cartesian(xlim = c(-3, 3), ylim = c(0, 1.25)) + 
  geom_text(x = -1, y = 1.2, label = b, parse = TRUE)

ggarrange(lm_retail_sanf, lm_cafe_sanf, 
          ncol = 2, nrow = 1)

2nd Mini-project Answers

April 1, 2019 (Monday)

1. Atlanta BeltLine and Changes in Local Economies

2. Transit Catchment Areas in the United States

3. Measuring Consumption Amenities