faf_selected_subset <- read_csv("data/faf_selected_subset.csv")A5 - Interactive Geospatial
Data Munging
- The non-numeric data columns are of the form
- . First, remove the - part of the data
faf_selected_subset$dms_orig <- gsub("^\\d+-", "", faf_selected_subset$dms_orig)
faf_selected_subset$dms_dest <- gsub("^\\d+-", "", faf_selected_subset$dms_dest)
faf_selected_subset$sctg2 <- gsub("^\\d+-", "", faf_selected_subset$sctg2)
faf_selected_subset$dms_mode <- gsub("^\\d+-", "", faf_selected_subset$dms_mode)
data_preview <- as.data.frame(head(faf_selected_subset))
datatable(data_preview)- The state information is embedded in the dms-orig and dms-dest columns. This needs to be extracted.
- For most, the 2-letter abbreviations are available for extraction
- For some rows, there are multiple states involved in a region. For example, 532-Portland OR-WA (WA Part). In these instances, you need to extract the state from the part in parenthesis. In this instance, you’d extract WA
- Some rows have the full state names. These need to be converted to 2-letter state abbreviations
extract_state <- function(location) {
# Extract state abbreviation from parentheses
paren_state <- regmatches(location, regexpr("\\((\\w+) Part\\)", location))
if (length(paren_state) > 0 && nchar(paren_state) > 0) {
return(sub("\\((\\w+) Part\\)", "\\1", paren_state))
}
# Extract standard 2-letter state abbreviations
state_abbr <- regmatches(location, regexpr("\\b[A-Z]{2}\\b", location))
if (length(state_abbr) > 0 && nchar(state_abbr) > 0) {
return(state_abbr)
}
}
# Function to convert full state names to abbreviations
convert_state_name_to_abbr <- function(full_name) {
abbr <- state_abbreviations[full_name]
if (!is.null(abbr)) {
return(abbr)
} else {
return(NA)
}
}
faf_selected_subset$dms_orig_state <- sapply(faf_selected_subset$dms_orig, extract_state)
faf_selected_subset$dms_dest_state <- sapply(faf_selected_subset$dms_dest, extract_state)
full_state_names <- faf_selected_subset$full_state_column
state_abbrs <- sapply(full_state_names, convert_state_name_to_abbr)
data_preview <- as.data.frame(head(faf_selected_subset))
datatable(data_preview)Map 1: Commodity Choropleths
# Selecting the 3 commodities and totalling them
commodity_data <- faf_selected_subset %>%
filter(sctg2 %in% c("Motorized vehicles", "Alcoholic beverages", "Electronics")) %>%
mutate(dms_dest_state = as.character(dms_dest_state),
commodity = as.character(sctg2),
value = `million dollars in 2017`) %>%
select(dms_dest_state, commodity, value) %>%
group_by(dms_dest_state, commodity) %>%
summarise(total_value = sum(value))
commodity_data_wide <- commodity_data %>%
pivot_wider(names_from = commodity, values_from = total_value)
#write_csv(commodity_data_wide, "merged_data.csv")Map 2: Largest Import by State
# Getting the largest tonnage for each state and its corresponding commodity
largest_import <- faf_selected_subset %>%
mutate(dms_dest_state = as.character(dms_dest_state),
commodity = as.character(sctg2),
tonnage = `thousand tons in 2017`) %>%
select(dms_dest_state, commodity, tonnage) %>%
group_by(dms_dest_state, commodity) %>%
summarise(total_tonnage = sum(tonnage)) %>%
group_by(dms_dest_state) %>%
slice_max(order_by = total_tonnage, n = 1)
# Getting the full names of the states
states_sf <- st_read("data/tl_2020_us_state/tl_2020_us_state.shp")Reading layer `tl_2020_us_state' from data source
`/Users/parsakeyvani/Desktop/Adv Data viz/Assignments /spring2024-a5-interactive-geospatial-keyvanip/data/tl_2020_us_state/tl_2020_us_state.shp'
using driver `ESRI Shapefile'
Simple feature collection with 56 features and 14 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -179.2311 ymin: -14.60181 xmax: 179.8597 ymax: 71.43979
Geodetic CRS: GCS_North_American_1983
states_sf <- as.data.frame(states_sf)
states <-states_sf %>% select(NAME, STUSPS)
largest_import <- merge(x= largest_import, y= states, by.x = "dms_dest_state",
by.y = "STUSPS")
#write_csv(largest_import, "largest_import.csv")Map 3: Commodity Connection Map
# Getting the largest tonnage for each state and its corresponding commodity
connection_map <- faf_selected_subset %>%
mutate(dms_dest_state = as.character(dms_dest_state),
dms_orig_state = as.character(dms_orig_state),
commodity = as.character(sctg2),
value = `million dollars in 2017`) %>%
filter(commodity == "Alcoholic beverages",
dms_dest_state == "VA", dms_mode == "Truck") %>%
group_by(dms_orig_state, dms_dest_state) %>%
summarise(total_value = sum(value))
connection_map_subset <- connection_map %>%
arrange(desc(total_value)) %>%
head(6) %>%
filter(dms_orig_state != "VA")
connection_map <- merge(x= connection_map, y= states, by.x = "dms_orig_state",
by.y = "STUSPS")
write_csv(connection_map, "connection_map.csv")