This assignment explores the process of using a web API to acquire data and then loading that data into an R dataframe.
# load necessary supporting packages
library(httr)
library(plyr)
library(dplyr)
# define the component parts of the API request
url_base <- 'http://api.nytimes.com/svc/semantic/v2/geocodes/'
query_header <- 'query.json?'
query_condition1 <- 'country_code=US'
api_key_header <- "api_key="
api_key <- readLines('C:/Users/cbailey/Desktop/CUNY MSDS/607 Data Aquisition and Management/Week9/nytimes_api_key.csv')
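As an aside, the hard-coded path ties this step to one machine. A sketch of an alternative, assuming the key has instead been stored in an environment variable named NYTIMES_API_KEY (an illustrative name, set for example in .Renviron):
# alternative: read the key from an environment variable rather than a file
# NYTIMES_API_KEY is an illustrative name, e.g. set in ~/.Renviron
api_key <- Sys.getenv("NYTIMES_API_KEY")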
# construct the API request and store the response
response <- GET(paste(url_base
,query_header
,query_condition1
,'&'
,api_key_header
,api_key
,sep = ''))
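As a side note, httr can assemble and URL-encode the query string itself via its query argument, which avoids hand-pasting separators. A sketch of the same request in that style (same endpoint and parameter names as above):
# equivalent request letting httr build and encode the query string
response <- GET(paste0(url_base, 'query.json'),
                query = list(country_code = 'US', api_key = api_key))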
# parse the API response
response_parsed <- content(response, "parsed")
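One caveat: if the request had failed, content() would happily parse the error payload instead of the data. httr's stop_for_status() is a minimal guard that could be placed between the request and the parsing step:
# halt with an informative error if the HTTP status indicates failure
stop_for_status(response)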
# element 4 of the parsed response contains the records of interest
# unlist each record, producing a list of named vectors
unlisted <- sapply(response_parsed[[4]], unlist)
# transpose each named vector into a one-row matrix
unlisted_trans <- lapply(unlisted, t)
# convert each one-row matrix into a single-record dataframe
dfs <- lapply(unlisted_trans, data.frame, stringsAsFactors = FALSE)
# bind the individual record dataframes together into a single dataframe,
# filling any missing columns with NA (rbind.fill is from plyr)
df_response <- rbind.fill(dfs)
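For reference, dplyr offers bind_rows(), which also accepts a list of dataframes and fills unmatched columns with NA; it could replace the plyr dependency here:
# equivalent binding step using dplyr alone
df_response <- bind_rows(dfs)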
# check the names of the dataframe columns
names(df_response)
## [1] "concept_id" "concept_name" "geocode_id"
## [4] "geoname_id" "name" "latitude"
## [7] "longitude" "elevation" "population"
## [10] "country_code" "country_name" "admin_code1"
## [13] "admin_code2" "admin_name1" "admin_name2"
## [16] "feature_class" "feature_code" "feature_code_name"
## [19] "time_zone_id" "dst_offset" "gmt_offset"
## [22] "geocodes_created" "geocodes_updated"
# tidy the data: coerce the coordinates and population to numeric,
# rename the state column, and keep only the columns of interest
df_response_slim <- df_response %>%
  mutate(longitude = as.numeric(longitude)
         ,latitude = as.numeric(latitude)
         ,population = as.numeric(population)) %>%
  rename(state_code = admin_code1) %>%
  select(state_code, name, latitude, longitude)
df_response_slim
## state_code name latitude longitude
## 1 VA Charlottesville 38.02931 -78.47668
## 2 PA Philadelphia 39.95233 -75.16379
## 3 CO San Juan National Forest 37.69166 -107.80895
## 4 MA Nantucket 41.28346 -70.09946
## 5 OR Yamhill 45.34150 -123.18733
## 6 MO Ohio River 36.98672 -89.13062
## 7 NY Bellport 40.75704 -72.93927
## 8 NY Sleepy Hollow 41.08565 -73.85847
## 9 MA Sandwich 41.75899 -70.49392
## 10 LA New Orleans 29.95465 -90.07507
## 11 NJ Clifton 40.85843 -74.16375
## 12 MD Baltimore 39.29039 -76.61219
## 13 NE Nebraska 41.50028 -99.75067
## 14 WI Fish Creek 45.12777 -87.24705
## 15 NY Elmira 42.08980 -76.80773
## 16 CT Danbury 41.39482 -73.45401
## 17 VT Vermont 44.00034 -72.74983
## 18 CA Napa 38.29714 -122.28553
## 19 NY Warwick 41.25648 -74.35988
## 20 OR Newport 44.63678 -124.05345
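Before plotting, a quick sanity check that every point falls inside a rough continental-US bounding box can catch coercion or parsing mistakes early (the bounds below are approximate and chosen here for illustration):
# rough sanity check: all points inside an approximate continental-US box
stopifnot(all(df_response_slim$latitude > 24 & df_response_slim$latitude < 50)
          , all(df_response_slim$longitude > -125 & df_response_slim$longitude < -66))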
# load the libraries needed for map plotting
library(ggplot2)
library(ggmap)
# load the outlines of the US states (map_data draws on the maps package)
states <- map_data("state")
ggplot(data = states) +
  geom_polygon(aes(x = long, y = lat, group = group)
               , fill = "lightblue"
               , color = "black") +
  coord_fixed(1.3) + # fix the aspect ratio for proper display
  guides(fill = FALSE) + # turn off the fill legend
  theme_nothing() + # remove the grid and axes (from ggmap)
  geom_point(data = df_response_slim
             , aes(x = longitude, y = latitude)
             , color = "red"
             , size = 2) # add the API geo points
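To keep a copy of the figure, ggplot2's ggsave() writes the most recently displayed plot to disk; the file name and dimensions below are illustrative:
# save the most recent plot; file name and size are illustrative
ggsave("us_geocode_points.png", width = 8, height = 5)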
This project demonstrated one method of acquiring data from a web API and a fairly simple way of loading it into an R dataframe.
Lastly, since the acquired data was geographic, the points were plotted on a map of the USA.