OpenStreetMap: Is open data built by a community of mappers that contribute and maintain data about roads, trails, cafés, railway stations, and much more, all over the world.
We are going to use tidyverse a collection of R packages designed for data science. As well as other packages to help with data cleaning and processing.
The geocode function below allow us to make requests to Nominatim a search engine for OpenStreetMap data.
# INPUT LOCATION VARIABLES
# [RECORD_ID], [ADDRESS], [CITY], [STATE], [ZIPCODE]
geocode <- function(record_id, address, city, state, zipcode){
# NOMINATIM SEARCH API URL
src_url <- "https://nominatim.openstreetmap.org/search?q="
###### INPUTS PREPARATION #####
city <- str_replace_all(string = city,
pattern = "\\s|,",
replacement = "+")
# CREATE A FULL ADDRESS
addr <- paste(address, city, state, zipcode, sep = "%2C")
# CREATE A SEARCH URL BASED ON NOMINATIM API TO RETURN GEOJSON
requests <- paste0(src_url, addr, "&format=geojson")
for (i in 1:length(requests)) {
response <- read_html(requests[i]) %>%
html_node("p") %>%
html_text() %>%
fromJSON()
# FROM THE RESPONSE EXTRACT LATITUDE AND LONGITUDE COORDINATES
lon <- response$features$geometry$coordinates[[1]][1]
lat <- response$features$geometry$coordinates[[1]][2]
# CREATE A COORDINATES DATAFRAME
if(i == 1) {
loc <- tibble(record_id = record_id[i],
address = str_replace_all(addr[i], "%2C", ","),
latitude = lat, longitude = lon)
}else{
df <- tibble(record_id = record_id[i],
address = str_replace_all(addr[i], "%2C", ","),
latitude = lat, longitude = lon)
loc <- bind_rows(loc, df)
}
}
return(loc)
}
This data set pertains to all types of structural permits. Data includes details on application/permit numbers, job addresses, supervisorial districts, and the current status of the applications.
data <- read.csv("data/building_permits.csv", check.names=FALSE)
head(data)
tail(data)
colnames(data)
[1] "Permit Number"
[2] "Permit Type"
[3] "Permit Type Definition"
[4] "Permit Creation Date"
[5] "Block"
[6] "Lot"
[7] "Street Number"
[8] "Street Number Suffix"
[9] "Street Name"
[10] "Street Suffix"
[11] "Unit"
[12] "Unit Suffix"
[13] "Description"
[14] "Current Status"
[15] "Current Status Date"
[16] "Filed Date"
[17] "Issued Date"
[18] "Completed Date"
[19] "First Construction Document Date"
[20] "Structural Notification"
[21] "Number of Existing Stories"
[22] "Number of Proposed Stories"
[23] "Voluntary Soft-Story Retrofit"
[24] "Fire Only Permit"
[25] "Permit Expiration Date"
[26] "Estimated Cost"
[27] "Revised Cost"
[28] "Existing Use"
[29] "Existing Units"
[30] "Proposed Use"
[31] "Proposed Units"
[32] "Plansets"
[33] "TIDF Compliance"
[34] "Existing Construction Type"
[35] "Existing Construction Type Description"
[36] "Proposed Construction Type"
[37] "Proposed Construction Type Description"
[38] "Site Permit"
[39] "Supervisor District"
[40] "Neighborhoods - Analysis Boundaries"
[41] "Zipcode"
[42] "Location"
[43] "Record ID"
[44] "SF Find Neighborhoods"
[45] "Current Police Districts"
[46] "Current Supervisor Districts"
[47] "Analysis Neighborhoods"
[48] "DELETE - Zip Codes"
[49] "DELETE - Fire Prevention Districts"
[50] "DELETE - Supervisor Districts"
[51] "DELETE - Current Police Districts"
[52] "DELETE - Supervisorial_Districts_Waterline_data_from_7pkg_wer3"
length(colnames(data))
[1] 52
colremove = c("First.Construction.Document.Date",
"Structural.Notification",
"Number.of.Existing.Stories",
"Number.of.Proposed.Stories",
"Voluntary.Soft.Story.Retrofit",
"Fire.Only.Permit","Existing.Units",
"Proposed.Units","Plansets",
"TIDF.Compliance","Existing.Construction.Type",
"Proposed.Construction Type","Site.Permit",
"Supervisor.District","Current.Police.Districts",
"Current.Supervisor.Districts",
"Current.Status.Date", "Permit.Creation.Date",
"Analysis.Neighborhoods","Lot","Location",
"SF.Find.Neighborhoods","Unit","Block", "Permit.Type",
"Unit.Suffix","Street.Number.Suffix",
"Existing.Construction.Type.Description")
data <- data[1:47,!(names(data) %in% colremove)]
length(colnames(data))
[1] 19
colnames(data)
[1] "Permit Number"
[2] "Permit Type Definition"
[3] "Street Number"
[4] "Street Name"
[5] "Street Suffix"
[6] "Description"
[7] "Current Status"
[8] "Filed Date"
[9] "Issued Date"
[10] "Completed Date"
[11] "Permit Expiration Date"
[12] "Estimated Cost"
[13] "Revised Cost"
[14] "Existing Use"
[15] "Proposed Use"
[16] "Proposed Construction Type Description"
[17] "Neighborhoods - Analysis Boundaries"
[18] "Zipcode"
[19] "Record ID"
Now, let us rename two of the columns in the dataset:
data <- rename(data, "Neighborhoods Boundaries" = "Neighborhoods - Analysis Boundaries")
data <- rename(data, "Permit Type" = "Permit Type Definition")
colnames(data) <- str_replace_all(tolower(colnames(data)), " ","_")
Let us see the lowercase column datas:
colnames(data)
[1] "permit_number"
[2] "permit_type"
[3] "street_number"
[4] "street_name"
[5] "street_suffix"
[6] "description"
[7] "current_status"
[8] "filed_date"
[9] "issued_date"
[10] "completed_date"
[11] "permit_expiration_date"
[12] "estimated_cost"
[13] "revised_cost"
[14] "existing_use"
[15] "proposed_use"
[16] "proposed_construction_type_description"
[17] "neighborhoods_boundaries"
[18] "zipcode"
[19] "record_id"
dim_desc(data)
[1] "[1,048,575 x 19]"
bldg_df <- data %>% filter(current_status == "complete")
head(bldg_df)
NA
dim_desc(bldg_df)
[1] "[574,003 x 19]"
bldg_df <- na.omit(bldg_df)
dim_desc(bldg_df)
[1] "[343,237 x 19]"
bldg_df$city <- "San Francisco"
bldg_df$state <- "CA"
bldg_df$record_id <- as.character(bldg_df$record_id)
bldg_df$proposed_construction_type_description <-
bldg_df$proposed_construction_type_description %>%
str_remove_all(pattern = "\\(|\\)") %>%
str_replace_all(" ", "_") %>%
as_factor()
bldg_df$street_name <- str_remove(string = bldg_df$street_name, pattern = "^0+")
bldg_df$street_name <- str_replace_all(string = bldg_df$street_name,
pattern = "Bay Shore",
replacement = "Bayshore")
bldg_df$street_suffix <- str_replace_all(string = bldg_df$street_suffix,
pattern = "Bl",
replacement = "Blvd")
bldg_df$street_suffix <- str_replace_all(string = bldg_df$street_suffix,
pattern = "Tr",
replacement = "Ter")
bldg_df$changed_use <- bldg_df$existing_use != bldg_df$proposed_use
bldg_df$estimated_revised_diff <- bldg_df$revised_cost - bldg_df$estimated_cost
bldg_df$issued_date <- bldg_df$issued_date %>% mdy() %>% as_date()
bldg_df$issued_year <- bldg_df$issued_date %>% year()
bldg_df$issued_month <- bldg_df$issued_date %>% month(label = TRUE)
bldg_df$completed_date <- bldg_df$completed_date %>% mdy() %>% as_date()
bldg_df$completed_year <- bldg_df$completed_date %>% year()
bldg_df$completed_month <- bldg_df$completed_date %>% month(label = TRUE)
bldg_df$issued_completed_days <- bldg_df$completed_date - bldg_df$issued_date
bldg_df$issued_completed_years <- bldg_df$completed_year - bldg_df$issued_year
bldg_df$filed_date <- bldg_df$filed_date %>% mdy() %>% as_date()
bldg_df$filed_issued_days <- bldg_df$issued_date - bldg_df$filed_date
bldg_df$permit_expiration_date <- bldg_df$permit_expiration_date %>% mdy() %>% as_date()
bldg_df$issued_expiration_days <- bldg_df$permit_expiration_date - bldg_df$issued_date
To look deeper into changes in zoning and type of constructions. We want to select only project that changed the building use. Also we will like to look at developments with significant capital investment in particular greater than half a million dollars
bldg_df <- bldg_df %>% arrange(desc(estimated_cost))
bldg_df <- bldg_df[bldg_df$changed_use == TRUE,]
bldg_df <- bldg_df %>% filter(revised_cost > 500000)
bldg_df$zipcode <- as_factor(bldg_df$zipcode)
bldg_df$permit_type <- as_factor(bldg_df$permit_type)
bldg_df$neighborhoods_boundaries <- as_factor(bldg_df$neighborhoods_boundaries)
bldg_df$proposed_use <- as_factor(bldg_df$proposed_use)
head(bldg_df)
NA
dim_desc(bldg_df)
[1] "[2,407 x 31]"
# ADDRESS VARIABLE MUST MATH NOMINATIM API
address <- paste0(bldg_df$street_number," ",
bldg_df$street_name," ",
bldg_df$street_suffix)
# DATA CLEANING SPECIAL CASES (e.g. 3333-3339 N CLARK)
query <- stri_replace(str = address,
replacement = "",
regex = "(-[0-9]+\\s)")
# REPLACE SPACES (\\s) OR COMMAS (,) WITH + SIGN
query <- str_replace_all(string = query,
pattern = "\\s|,",
replacement = "+")
head(query)
[1] "250+Howard+St" "245+1st+St" "1351+3rd+St"
[4] "1245+3rd+St" "1251+3rd+St" "449+Mission+Rock+St"
df <- geocode(record_id = bldg_df$record_id,
address = query,
city = bldg_df$city,
state = bldg_df$state,
zipcode = bldg_df$zipcode)
Create a main dataset with all the relevant information
Finally lets save the new dataset containing the restaurants Geolocation (latitude, longitude).
write_csv(bldg_df, "data/building_geocoord.csv")