library(tidyverse)

Load the datasets

# Column specification shared by both voyage CSVs. Columns are listed by the
# type they are parsed as; cols_only() skips every column not named here.
col_types_spec <- local({
  integer_cols <- c("id", "voyage_id")

  double_cols <- c(
    "voyage_dates__imp_arrival_at_port_of_dis_sparsedate__year",
    "voyage_slaves_numbers__imp_total_num_slaves_disembarked",
    "voyage_slaves_numbers__imp_total_num_slaves_embarked",
    "voyage_dates__length_middle_passage_days",
    "voyage_dates__imp_length_home_to_disembark",
    "voyage_crew__crew_first_landing",
    "voyage_crew__crew_voyage_outset",
    "voyage_ship__tonnage_mod",
    "voyage_slaves_numbers__imp_jamaican_cash_price",
    "voyage_slaves_numbers__imp_mortality_ratio",
    "voyage_slaves_numbers__percentage_women_among_embarked_slaves",
    "voyage_dates__slave_purchase_began_sparsedate__month",
    "voyage_slaves_numbers__percentage_men",
    "voyage_dates__voyage_completed_sparsedate__month",
    "voyage_slaves_numbers__percentage_boy",
    "voyage_dates__date_departed_africa_sparsedate__month",
    "voyage_dates__voyage_began_sparsedate__month",
    "voyage_dates__first_dis_of_slaves_sparsedate__month",
    "voyage_slaves_numbers__percentage_girl",
    "voyage_slaves_numbers__percentage_child",
    "voyage_slaves_numbers__percentage_women",
    "voyage_dates__departure_last_place_of_landing_sparsedate__month",
    "voyage_slaves_numbers__percentage_male",
    "voyage_slaves_numbers__percentage_female"
  )

  character_cols <- c(
    "voyage_outcome__vessel_captured_outcome__name",
    "voyage_ship__imputed_nationality__name",
    "voyage_itinerary__imp_region_voyage_begin__name",
    "voyage_ship__rig_of_vessel__name",
    "voyage_itinerary__place_voyage_ended__name",
    "voyage_itinerary__region_of_return__name",
    "voyage_itinerary__imp_principal_region_slave_dis__name",
    "voyage_itinerary__imp_principal_region_of_slave_purchase__name",
    "voyage_itinerary__imp_port_voyage_begin__name",
    "voyage_itinerary__imp_broad_region_slave_dis__name",
    "voyage_outcome__particular_outcome__name",
    "voyage_itinerary__imp_principal_port_slave_dis__name",
    "voyage_outcome__outcome_owner__name",
    "voyage_outcome__outcome_slaves__name",
    "voyage_itinerary__imp_principal_place_of_slave_purchase__name",
    "voyage_outcome__resistance__name",
    "voyage_itinerary__imp_broad_region_voyage_begin__name",
    "voyage_itinerary__imp_broad_region_of_slave_purchase__name",
    "voyage_sources",
    "enslavers"
  )

  # Pair every column name with a collector of the requested type.
  typed <- function(col_names, collector) {
    setNames(rep(list(collector), length(col_names)), col_names)
  }

  do.call(
    cols_only,
    c(
      typed(integer_cols, col_integer()),
      typed(double_cols, col_double()),
      typed(character_cols, col_character())
    )
  )
})

# Download the Trans-Atlantic voyage table, parsing only the columns
# declared in col_types_spec.
trans <-
  "https://raw.githubusercontent.com/imowerman-prog/data-3210/refs/heads/main/Data/trans-atlantic.csv" %>%
  read_csv(col_types = col_types_spec)

# Download the Intra-American voyage table with the same column spec so the
# two tables share names and types.
intra <-
  "https://raw.githubusercontent.com/imowerman-prog/data-3210/refs/heads/main/Data/intra-american.csv" %>%
  read_csv(col_types = col_types_spec)

Clean the data:

1. Rename long columns for readability

# Short, readable names for the long source columns (new name = old name).
# Both tables are renamed from this single lookup so the two mappings cannot
# drift apart.
short_names <- c(
  year = "voyage_dates__imp_arrival_at_port_of_dis_sparsedate__year",
  slaves_embarked = "voyage_slaves_numbers__imp_total_num_slaves_embarked",
  slaves_disembarked = "voyage_slaves_numbers__imp_total_num_slaves_disembarked",
  outcome_slaves = "voyage_outcome__outcome_slaves__name",
  outcome_particular = "voyage_outcome__particular_outcome__name",
  dis_broad = "voyage_itinerary__imp_broad_region_slave_dis__name",
  dis_region = "voyage_itinerary__imp_principal_region_slave_dis__name",
  dis_port = "voyage_itinerary__imp_principal_port_slave_dis__name",
  country = "voyage_ship__imputed_nationality__name"
)

# all_of() makes rename() fail loudly if any source column is missing.
trans_renamed <- trans %>%
  rename(all_of(short_names))

intra_renamed <- intra %>%
  rename(all_of(short_names))

2. Convert year to integer, slave numbers to numeric

# Coerce the arrival year to an integer and make sure both head counts are
# numeric in the Trans-Atlantic table.
trans_clean <- trans_renamed %>%
  mutate(
    across(c(slaves_embarked, slaves_disembarked), as.numeric),
    year = as.integer(year)
  )

# Same coercions for the Intra-American table.
intra_clean <- intra_renamed %>%
  mutate(
    across(c(slaves_embarked, slaves_disembarked), as.numeric),
    year = as.integer(year)
  )

3. Filter out rows where slaves_disembarked is 0 or NA (incomplete voyages)

# Keep only voyages with a known, positive number of people disembarked.
trans_clean <- trans_clean %>%
  drop_na(slaves_disembarked) %>%
  filter(slaves_disembarked > 0)

intra_clean <- intra_clean %>%
  drop_na(slaves_disembarked) %>%
  filter(slaves_disembarked > 0)

4. Filter for successful outcomes (e.g., “Slaves disembarked”, “Voyage completed”, “Sold slaves”)

# Outcome wording that marks a voyage as successful.
success_values <- c("Slaves disembarked", "Voyage completed", "Sold slaves")

# One case-insensitive pattern that matches any of the success labels.
# NOTE(review): labels are matched as substrings because the exact outcome
# wording stored in the CSVs is not visible here -- confirm against
# distinct(outcome_slaves) and distinct(outcome_particular).
success_pattern <- regex(
  str_c(success_values, collapse = "|"),
  ignore_case = TRUE
)

# Keep the voyages whose slave outcome or particular outcome reads as a
# success. A missing outcome is treated as an empty string, so a voyage with
# no recorded outcome in either column is dropped.
#
# voyages: a voyage tibble with outcome_slaves and outcome_particular columns.
# Returns the rows of `voyages` with a successful outcome.
keep_successful <- function(voyages) {
  voyages %>%
    filter(
      str_detect(coalesce(outcome_slaves, ""), success_pattern) |
        str_detect(coalesce(outcome_particular, ""), success_pattern)
    )
}

# trans_clean / intra_clean already carry the type conversions and the
# positive-disembarkation filter from the earlier steps, so only the outcome
# filter is added here. Previously success_values was defined but never
# applied; totals printed further down predate this filter and need to be
# regenerated by re-running the script.
trans_clean <- keep_successful(trans_clean)
intra_clean <- keep_successful(intra_clean)

5. Add new columns: decade (e.g., floor(year / 10) * 10), estimated_deaths (slaves_embarked - slaves_disembarked), and is_us (TRUE if the disembarkation location is US-based, using dis_broad == “Mainland North America” or specific US regions/ports like “New Orleans”).

# Add the derived analysis columns to a cleaned voyage table.
#
# voyages:      a voyage tibble with year, slaves_embarked,
#               slaves_disembarked, dis_broad, dis_region and dis_port.
# source_label: value stored in the new source_type column.
#
# Returns `voyages` with four added columns:
#   decade           - year rounded down to the start of its decade
#   estimated_deaths - slaves_embarked minus slaves_disembarked
#   is_us            - TRUE when the broad region, region, or port of
#                      disembarkation is in one of the US lookup lists below
#   source_type      - `source_label`
#
# The lookup lists live here once so both tables are flagged by exactly the
# same rule.
add_voyage_features <- function(voyages, source_label) {
  us_broad_regions <- c("Mainland North America", "North America")
  us_regions <- c(
    "New York", "Massachusetts", "Virginia", "Louisiana",
    "South Carolina", "North Carolina", "Georgia",
    "Florida", "Maryland", "Texas"
  )
  us_ports <- c(
    "New Orleans", "New York", "Boston", "Charleston",
    "Savannah", "Norfolk", "Mobile", "Annapolis",
    "Baltimore", "Newport"
  )

  voyages %>%
    mutate(
      decade = floor(year / 10) * 10,
      estimated_deaths = slaves_embarked - slaves_disembarked,
      # %in% never returns NA, so is_us is always TRUE or FALSE.
      is_us = dis_broad %in% us_broad_regions |
        dis_region %in% us_regions |
        dis_port %in% us_ports,
      source_type = source_label
    )
}

trans_final <- add_voyage_features(trans_clean, "Trans-Atlantic")

intra_final <- add_voyage_features(intra_clean, "Intra-American")

6. Combine the datasets with bind_rows(), adding a source_type column (“Trans-Atlantic” or “Intra-American”)

# Stack the two voyage tables; source_type records which table each row
# came from.
combined_data <- trans_final %>%
  bind_rows(intra_final)

Part 2: Analysis and Questions (5 points)

1. Total slaves imported to the US: Filter for is_us == TRUE, sum slaves_disembarked from both datasets.

# One-row table holding the number of people disembarked on voyages flagged
# as US arrivals, across both source tables.
us_total <- tibble(
  total_slaves_imported_to_us = with(
    combined_data,
    sum(slaves_disembarked[is_us], na.rm = TRUE)
  )
)

us_total
## # A tibble: 1 × 1
##   total_slaves_imported_to_us
##                         <dbl>
## 1                      439667

2. Proportion of all slaves taken from Africa: Calculate US total / total slaves_embarked from Trans-Atlantic dataset (as this represents slaves taken from Africa).

# Numerator: people disembarked on voyages flagged as US arrivals, from both
# source tables.
us_total_value <- with(
  combined_data,
  sum(slaves_disembarked[is_us], na.rm = TRUE)
)

# Denominator: people embarked on Trans-Atlantic voyages only.
africa_total_value <- sum(trans_final$slaves_embarked, na.rm = TRUE)

# Both totals side by side with their ratio.
proportion_africa_to_us <- tibble(
  us_total = us_total_value,
  total_taken_from_africa = africa_total_value
) %>%
  mutate(proportion = us_total / total_taken_from_africa)

proportion_africa_to_us
## # A tibble: 1 × 3
##   us_total total_taken_from_africa proportion
##      <dbl>                   <dbl>      <dbl>
## 1   439667                10575764     0.0416

3. Graph slave imports by decade to the US: Filter for US, group by decade, sum slaves_disembarked, plot as a bar graph with ggplot2.

# Total people disembarked in the US per decade; count() with a weight sums
# slaves_disembarked within each decade.
us_by_decade <- combined_data %>%
  filter(is_us) %>%
  count(decade, wt = slaves_disembarked, name = "total_imported")

# Bar chart of the decade totals. The decade is turned into a factor so each
# decade gets its own evenly spaced bar.
us_by_decade %>%
  mutate(decade = factor(decade)) %>%
  ggplot(aes(x = decade, y = total_imported)) +
  geom_col() +
  ggtitle("Slave Imports to the United States by Decade") +
  xlab("Decade") +
  ylab("Total Slaves Disembarked") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

4. Imports to the US by decade and region/port/state: Filter for US, group by decade, dis_region, dis_port (approximate state from port/region, e.g., “New Orleans” -> “Louisiana”), sum slaves_disembarked. Use a table and faceted bar plot.

# TRUE where `text` contains `pattern`, ignoring case. A missing value is
# treated as an empty string, so the result is never NA.
mentions <- function(text, pattern) {
  str_detect(coalesce(text, ""), regex(pattern, ignore_case = TRUE))
}

# US arrivals per decade, region and port, with an approximate state.
# case_when() uses the first rule that matches, so the port is consulted
# before the region; rows matching neither fall through to "Other/Unknown".
us_region_port_state <- combined_data %>%
  filter(is_us, !is.na(decade)) %>%
  mutate(
    state_approx = case_when(
      # Port-based rules
      mentions(dis_port, "new orleans") ~ "Louisiana",
      mentions(dis_port, "new york") ~ "New York",
      mentions(dis_port, "boston") ~ "Massachusetts",
      mentions(dis_port, "charleston") ~ "South Carolina",
      mentions(dis_port, "savannah") ~ "Georgia",
      mentions(dis_port, "norfolk") ~ "Virginia",
      mentions(dis_port, "mobile") ~ "Alabama",
      mentions(dis_port, "annapolis") ~ "Maryland",
      mentions(dis_port, "baltimore") ~ "Maryland",
      mentions(dis_port, "newport") ~ "Rhode Island",
      # Region-based rules
      mentions(dis_region, "louisiana") ~ "Louisiana",
      mentions(dis_region, "new york") ~ "New York",
      mentions(dis_region, "massachusetts") ~ "Massachusetts",
      mentions(dis_region, "south carolina") ~ "South Carolina",
      mentions(dis_region, "north carolina") ~ "North Carolina",
      mentions(dis_region, "georgia") ~ "Georgia",
      mentions(dis_region, "virginia") ~ "Virginia",
      mentions(dis_region, "florida") ~ "Florida",
      mentions(dis_region, "maryland") ~ "Maryland",
      mentions(dis_region, "texas") ~ "Texas",
      TRUE ~ "Other/Unknown"
    )
  ) %>%
  group_by(decade, dis_region, dis_port, state_approx) %>%
  summarise(
    total_imported = sum(slaves_disembarked, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Chronological order, largest arrivals first within each decade.
  arrange(decade, desc(total_imported))

head(us_region_port_state, n = 25)
## # A tibble: 25 × 5
##    decade dis_region    dis_port                   state_approx  total_imported
##     <dbl> <chr>         <chr>                      <chr>                  <dbl>
##  1   1610 Virginia      Hampton                    Virginia                  29
##  2   1620 Virginia      Virginia, port unspecified Virginia                   3
##  3   1630 New York      New York                   New York                  53
##  4   1630 Virginia      Virginia, port unspecified Virginia                  13
##  5   1630 Massachusetts Boston                     Massachusetts              7
##  6   1640 Virginia      Virginia, port unspecified Virginia                 435
##  7   1640 New York      New York                   New York                  69
##  8   1650 New York      New York                   New York                 477
##  9   1650 Virginia      Virginia, port unspecified Virginia                 469
## 10   1650 Maryland      Maryland, port unspecified Maryland                   5
## # ℹ 15 more rows

5. Countries participating in export from Africa, by decade: From Trans-Atlantic dataset, group by decade and voyage_ship__imputed_nationality__name (as “country”), count unique voyages or sum slaves_embarked. Display in a table.

# Per decade and ship nationality: number of distinct Trans-Atlantic voyages
# and total people embarked. Rows without a decade or a non-empty country
# are left out.
countries_by_decade <- trans_final %>%
  drop_na(decade, country) %>%
  filter(country != "") %>%
  group_by(decade, country) %>%
  summarise(
    voyages = n_distinct(voyage_id),
    total_embarked = sum(slaves_embarked, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  # Chronological order, largest exporter first within each decade.
  arrange(decade, desc(total_embarked))

Part 3: Visualizations and Publication (2 Points)

1. Plot 1: U.S. Slave Imports by Decade

# Publication version of the decade chart: same data, steel-blue bars.
us_by_decade %>%
  mutate(decade = factor(decade)) %>%
  ggplot(mapping = aes(x = decade, y = total_imported)) +
  geom_col(fill = "steelblue") +
  ggtitle("Slave Imports to the United States by Decade") +
  xlab("Decade") +
  ylab("Total Slaves Disembarked") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

2. Plot 2: U.S. Slave Imports by State and Decade

# Rank ports by their total across all decades. us_region_port_state has one
# row per decade/region/port and is sorted by decade, so its first 20 rows are
# the earliest records, not the largest ports; the totals are therefore
# summed per port before the 20 largest are picked.
top_us_ports <- us_region_port_state %>%
  # A row without a port name cannot be ranked as a port.
  filter(!is.na(dis_port)) %>%
  group_by(dis_port) %>%
  summarise(total_imported = sum(total_imported), .groups = "drop") %>%
  slice_max(total_imported, n = 20, with_ties = FALSE)

# Horizontal bars, largest port at the top.
ggplot(
  top_us_ports,
  aes(x = reorder(dis_port, total_imported), y = total_imported)
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top U.S. Ports by Slave Imports",
    x = "Port",
    y = "Total Slaves Disembarked"
  ) +
  theme_minimal()

3. Write a summary of what you have uncovered from this assignment.

In this assignment, I cleaned and analyzed data from both the Trans-Atlantic and Intra-American slave trade datasets using R and the tidyverse package. I renamed long variable names, converted key variables to numeric formats, filtered out incomplete voyages, and created new variables including decade, estimated deaths during voyages, and an indicator for whether enslaved people were disembarked in the United States. After cleaning the data, I combined the two datasets to analyze overall patterns.

The analysis showed how slave imports to the United States changed over time by decade. The results indicate that slave imports were concentrated in particular time periods and were associated with specific ports and regions. Ports such as New Orleans and other major coastal trading locations appeared frequently in the data, showing how important these areas were in the slave trade network.

The visualizations helped highlight how the number of enslaved people arriving in the United States varied across decades and locations. Additionally, examining the trans-Atlantic dataset revealed that several countries were involved in exporting enslaved people from Africa, demonstrating the international nature of the slave trade.

Overall, the analysis demonstrates the scale and geographic complexity of the slave trade. By cleaning and combining the datasets, it becomes easier to observe patterns in where enslaved people were transported, which ports were most active, and how participation by exporting countries changed over time.