Disclaimer: Some parts of this document, especially the introduction, are adapted from the DataCamp.com course Working with Web Data in R. This document was prepared for CP6521 Advanced GIS, a graduate-level city planning elective course at Georgia Tech, in Spring 2019. For any questions, contact the instructor, Yongsung Lee, Ph.D., via yongsung.lee(at)gatech.edu.
This document is also published on RPubs.
install.packages("tidyverse", dependencies = TRUE)
install.packages("sf", dependencies = TRUE)
install.packages("raster", dependencies = TRUE)
install.packages("sp", dependencies = TRUE)
install.packages("tmap", dependencies = TRUE)

install.packages("tidycensus", dependencies = TRUE)
install.packages("tigris", dependencies = TRUE)
install.packages("devtools")
devtools::install_github("jamgreen/lehdr")

install.packages("httr", dependencies = TRUE)
install.packages("jsonlite", dependencies = TRUE)
install.packages("viridis", dependencies = TRUE)
install.packages("ggpubr", dependencies = TRUE)
library(tidyverse)
library(sf)
library(raster)
library(sp)
library(tmap)

library(tidycensus)
library(tigris)
tigris_cache_dir(paste0(getwd(), "/tigris"))
readRenviron('~/.Renviron')
library(lehdr)

library(httr) 
library(jsonlite)
library(viridis)
library(ggpubr)

Learning Objectives

  1. Learn about Application Programming Interfaces (API)
  2. Learn the json format
  3. Learn how to parse data in the json format
  4. Practice with Google Places API

Theories

Application Programming Interfaces
  • ‘websites, but for machines’
  • Can be used to expose data automatically
  • Lets you make queries for specific bits of that data
API Clients
  • Native (in R!) interfaces to APIs
  • Hides API complexity
  • Lets you read data in as R objects
Using API Clients
API etiquette
  • Overwhelming the API means you can’t use it
  • Overwhelming the API means nobody else can use it
  • APIs issue ‘access tokens’ to control and identify use
HTTP requests
  • Conversation between your machine and the server
  • First: what you want to happen
  • “methods” - different requests for different tasks
  • GET: “get me something”
# Send a GET request to httpbin's echo endpoint and keep the response object.
response <- GET("https://httpbin.org/get")
# Parse the response body into an R object and print it.
content(x = response)
Error handling
  • Every response includes a HTTP status code
  • Code starts with:
    • 2 - great!
    • 3 - great!
    • 4 - your code is broken
    • 5 - their code is broken
    • REFERENCE
  • Check for bad codes with http_error()
URL construction
  • Most of URL doesn’t change
  • Stitch URLs together from bits that don’t change with the bits that do
  • Saves thinking and typing
Parameter-based URLs (e.g., Google API)
User agents
  • Bits of text that ID your browser (or software)
  • Gives the server some idea of what you’re trying to do
  • You can set one with your requests with user_agent()
  • Add an email address so they can contact you
# Attach a User-Agent string (with a contact email) to the request so the
# server operator can tell who is calling and how to reach them.
server_response <- GET(
  url,
  user_agent("my@email.address this is a test")
)
Rate limiting
  • Too many requests makes for a sad server
  • Deliberately slows down your code to keep under a desired ‘rate’ (e.g., Sys.sleep(2))
  • Slows you, but avoids getting you banned from the server
JSON (JavaScript Object Notation)
  • Plain text format
  • Two structures:
    • objects: {"title":"A New Hope", "year":"1977"}
    • arrays: [1977, 1980]
  • Values: “string”, 3, true, false, null, or another object or array
library(httr)
# Stitch the request URL together: fixed prefix, first location, fixed
# middle part, business type, and finally the API key.
url <- paste0(
  url_1, lat_lng[[1]],
  url_2, business,
  "&key=", my_api
)
# Send the request and keep the response.
r <- GET(url = url)
# Report the media type declared by the response.
http_type(r)
# [1] "application/json"
# Dump the raw response body as text.
writeLines(content(r, as = "text"))
# ?writeLines
Manipulating JSON
  • By jsonlite::fromJSON

Let’s practice!

Prepare the key input and several parameters.

How do we know the right format of the URL? Google provides very good documentation.

# Load the search locations saved in temp4.rds (one element per request)
lat_lng <- read_rds("YOUR FILEPATH GOES HERE")

# Fixed pieces of the Nearby Search request URL
url_1 <- "https://maps.googleapis.com/maps/api/place/nearbysearch/json?location="
url_2 <- "&rankby=distance&type="
token <- "&pagetoken="

# Pieces that change with the query or the user
business <- "cafe"
my_api <- "YOUR API KEY GOES HERE"

Before defining a fully functioning function, let’s see its basic structure.

# Send a request, GET the response to it, and read it in as an R object.
# Plain `<-` is used: at top level there is no enclosing function scope for
# `<<-` to reach, and `<<-` can end up targeting a same-named binding on the
# search path instead of the workspace.
result1_raw <-
  paste0(url_1, lat_lng[[1]], url_2, business, "&key=", my_api) %>%
  GET() %>%
  content(as = "text") %>%
  fromJSON()

# Keep the descriptive columns of the first result page ...
result1_df <-
  result1_raw$results %>%
  dplyr::select(name, place_id, types, rating, user_ratings_total)

# ... and lift the nested coordinates into flat x (lng) / y (lat) columns.
result1_df$x <- result1_raw$results$geometry$location$lng
result1_df$y <- result1_raw$results$geometry$location$lat

Now, let’s build a full function that handles possible errors.

#' Collect up to three result pages from the Google Places Nearby Search API
#' for one location.
#'
#' The request is assembled from `url_1`, `url_2`, `business`, `my_api` and
#' `token`, which are read from the enclosing (global) environment. All
#' intermediate results stay local to the call, so one call can never pick up
#' a page token or status left behind by a previous call.
#'
#' @param lat_lng Value pasted after `location=` in the request URL
#'   (presumably a "lat,lng" string -- confirm against the input data).
#' @return The string "INVALID_REQUEST" if any fetched page reported that
#'   status; otherwise a data frame (empty when no page had status "OK") with
#'   one row per place, holding whichever of `name`, `place_id`, `types`,
#'   `rating`, `user_ratings_total` the API returned, plus `x` (longitude)
#'   and `y` (latitude).
google_places <- function(lat_lng) {
  max_pages <- 3
  wanted_cols <- c("name", "place_id", "types", "rating", "user_ratings_total")
  first_page_url <- paste0(url_1, lat_lng, url_2, business, "&key=", my_api)

  # Request one page (the first one when `page_token` is NULL) and parse the
  # JSON body into an R list.
  fetch_page <- function(page_token = NULL) {
    page_url <- first_page_url
    if (!is.null(page_token)) {
      page_url <- paste0(first_page_url, token, page_token)
    }
    page_url %>%
      GET() %>%
      content(as = "text") %>%
      fromJSON()
  }

  # Flatten one parsed page into a data frame with lng/lat as x/y.
  tidy_page <- function(raw) {
    places <- raw$results
    # intersect() keeps the columns that are present, so a page in which no
    # place has e.g. a rating does not abort the whole collection.
    page_df <- places[intersect(wanted_cols, names(places))]
    page_df$x <- places$geometry$location$lng
    page_df$y <- places$geometry$location$lat
    page_df
  }

  page_dfs <- list()
  statuses <- character(0)
  page_token <- NULL

  for (page in seq_len(max_pages)) {
    # Page 1 is always requested; later pages only when the previous page
    # handed out a next_page_token.
    if (page == 1 || !is.null(page_token)) {
      raw <- fetch_page(page_token)
      statuses <- c(statuses, raw$status)
      if (identical(raw$status, "OK")) {
        page_dfs[[length(page_dfs) + 1]] <- tidy_page(raw)
      }
      page_token <- raw$next_page_token
    }
    print(paste0("Result #", page, " is just finished."))
    # Pause after every page: it rate-limits the requests, and Google
    # documents a short delay before a next_page_token becomes valid.
    Sys.sleep(2)
  }

  # Only statuses of pages that were actually fetched are inspected.
  if (any(statuses == "INVALID_REQUEST")) {
    print("INVALID_REQUEST")
    return("INVALID_REQUEST")
  }

  result_final <- dplyr::bind_rows(page_dfs)
  print("Succesful collection")
  result_final
}

Let’s run a for loop with the above function.

# make an empty output list
output <- vector("list", length(lat_lng))

# run a for loop; filling a preallocated list keeps the results collected so
# far if a later request stops with an error
for (i in seq_along(lat_lng)) {
  output[[i]] <- google_places(lat_lng[[i]])
}

# after finishing the for loop, count the elements that are not data frames
# (google_places() returns the string "INVALID_REQUEST" on failure).
# is.data.frame() is used instead of comparing class() to "data.frame":
# tibbles carry three classes, so the comparison would yield several values
# per element and inflate the count.
sum(!vapply(output, is.data.frame, logical(1)))

# Data cleaning -- only valid when the check above reported no errors.
# `output` is a list of data frames, one per location; stack them into a
# single tibble first.
output_df <- as_tibble(dplyr::bind_rows(output))

# Collapse duplicates: the same place can be returned for several locations,
# so keep one row per place_id with averaged attributes and coordinates.
places_by_id <- group_by(output_df, place_id)
output_df2 <- summarize(
  places_by_id,
  rating = mean(rating, na.rm = TRUE),
  rating_n = mean(user_ratings_total, na.rm = TRUE),
  x = mean(x),
  y = mean(y)
)

# Turn the x/y columns into point geometries, crs = 4326
output_sf <- st_as_sf(
  output_df2,
  coords = c("x", "y"),
  crs = 4326
)

# New variable ln_rating_n: log of (number of reviewers + 1)
output_sf$ln_rating_n <- log(1 + output_sf$rating_n)

With tmap, check the final outcome.

# Read in the two reference layers drawn on top of the collected places.
# tm_shape() needs a spatial object, not a file path, so the files must be
# read in rather than assigned as strings.
# NOTE(review): assumes both layers are saved as .rds files, like the lat_lng
# input read with read_rds() earlier; use sf::st_read() instead for a
# shapefile or GeoPackage.
inner_points <- read_rds("YOUR FILEPATH GOES HERE")
cbd_bf <- read_rds("YOUR FILEPATH GOES HERE")

# Interactive map: places coloured by log review count, the reference points
# as faint red dots, and the cbd_bf outline in red.
tmap_mode("view")
tm_basemap("Stamen.TonerLite", alpha = 0.25) +
  tm_shape(output_sf) +
  tm_dots(col = "ln_rating_n", size = 0.05, palette = "-viridis") +
  tm_shape(inner_points) +
  tm_dots(col = "red", size = 0.01, alpha = 0.25) +
  tm_shape(cbd_bf) +
  tm_borders(col = "red") +
  tm_tiles("Stamen.TonerLabels")