library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the raw border-crossing records; the path is relative to the
# current working directory.
border_data <- read.csv(file = "Border_Crossing_Entry_Data.csv")
# Report how many rows and columns were read.
dim(border_data)
## [1] 404543 10
# Preview the first rows to check the column names and value formats.
head(border_data)
## Port.Name State Port.Code Border Date
## 1 Jackman Maine 104 US-Canada Border Jan 2024
## 2 Porthill Idaho 3308 US-Canada Border Apr 2024
## 3 San Luis Arizona 2608 US-Mexico Border Apr 2024
## 4 Willow Creek Montana 3325 US-Canada Border Jan 2024
## 5 Warroad Minnesota 3423 US-Canada Border Jan 2024
## 6 Whitlash Montana 3321 US-Canada Border Jan 2024
## Measure Value Latitude Longitude
## 1 Trucks 6556 45.806 -70.397
## 2 Trucks 98 49.000 -116.499
## 3 Buses 10 32.485 -114.782
## 4 Pedestrians 2 49.000 -109.731
## 5 Personal Vehicle Passengers 9266 48.999 -95.377
## 6 Personal Vehicles 29 48.997 -111.258
## Point
## 1 POINT (-70.396722 45.805661)
## 2 POINT (-116.49925 48.999861)
## 3 POINT (-114.7822222 32.485)
## 4 POINT (-109.731333 48.999972)
## 5 POINT (-95.376555 48.999)
## 6 POINT (-111.257916 48.99725)
# Derive a numeric Year column from the last four characters of the Date
# string (e.g. "Jan 2024" -> 2024). The positions are counted from the end of
# each string, so the result stays correct even if the month part is not
# exactly three letters long.
border_data$Year <- as.numeric(
  substr(
    border_data$Date,
    nchar(border_data$Date) - 3,
    nchar(border_data$Date)
  )
)

# Keep only truck crossings recorded from 2020 through 2024 (inclusive).
trucks_data <- border_data |>
  filter(Year >= 2020, Year <= 2024, Measure == "Trucks")

# Report the size of the filtered data.
dim(trucks_data)
## [1] 5776 11
# Preview the filtered truck records, including the new Year column.
head(trucks_data)
## Port.Name State Port.Code Border Date Measure Value
## 1 Jackman Maine 104 US-Canada Border Jan 2024 Trucks 6556
## 2 Porthill Idaho 3308 US-Canada Border Apr 2024 Trucks 98
## 3 Warroad Minnesota 3423 US-Canada Border Jan 2024 Trucks 837
## 4 Wildhorse Montana 3323 US-Canada Border Jan 2024 Trucks 20
## 5 Fort Fairfield Maine 107 US-Canada Border Jan 2024 Trucks 525
## 6 Fortuna North Dakota 3417 US-Canada Border Jan 2024 Trucks 228
## Latitude Longitude Point Year
## 1 45.806 -70.397 POINT (-70.396722 45.805661) 2024
## 2 49.000 -116.499 POINT (-116.49925 48.999861) 2024
## 3 48.999 -95.377 POINT (-95.376555 48.999) 2024
## 4 48.999 -110.215 POINT (-110.215083 48.999361) 2024
## 5 46.765 -67.789 POINT (-67.789471 46.765323) 2024
## 6 49.000 -103.809 POINT (-103.80925 48.999555) 2024
# Total truck crossings per border for each month: every row of the result is
# one Border/Date combination with the sum of Value across all ports.
# `.groups = "drop"` returns an ungrouped tibble, so later steps do not
# silently inherit the Border grouping and summarise() no longer emits its
# regrouping message.
monthly_trucks <- trucks_data |>
  group_by(Border, Date) |>
  summarise(total_trucks = sum(Value), .groups = "drop")
## `summarise()` has grouped output by 'Border'. You can override using the
## `.groups` argument.
# Display the monthly totals for each border.
monthly_trucks
## # A tibble: 120 × 3
## # Groups: Border [2]
## Border Date total_trucks
## <chr> <chr> <int>
## 1 US-Canada Border Apr 2020 316002
## 2 US-Canada Border Apr 2021 456864
## 3 US-Canada Border Apr 2022 459328
## 4 US-Canada Border Apr 2023 433683
## 5 US-Canada Border Apr 2024 483095
## 6 US-Canada Border Aug 2020 465088
## 7 US-Canada Border Aug 2021 482579
## 8 US-Canada Border Aug 2022 494726
## 9 US-Canada Border Aug 2023 457659
## 10 US-Canada Border Aug 2024 466822
## # ℹ 110 more rows
# Describe the monthly truck totals within each border: centre (mean, median),
# variability (sd), and range (min, max). across() applies each named function
# to total_trucks and names the results "<function>_trucks".
monthly_summary <- monthly_trucks |>
  group_by(Border) |>
  summarise(
    across(
      total_trucks,
      list(mean = mean, median = median, sd = sd, min = min, max = max),
      .names = "{.fn}_trucks"
    )
  )
monthly_summary
## # A tibble: 2 × 6
## Border mean_trucks median_trucks sd_trucks min_trucks max_trucks
## <chr> <dbl> <dbl> <dbl> <int> <int>
## 1 US-Canada Border 455899. 461550. 35235. 316002 508440
## 2 US-Mexico Border 592466. 596331 53019. 402091 677862
library(ggplot2)
# Side-by-side boxplots comparing the distribution of monthly truck totals at
# the two borders. The fill legend is hidden because the x-axis already
# identifies each border.
monthly_trucks |>
  ggplot(aes(x = Border, y = total_trucks, fill = Border)) +
  geom_boxplot() +
  ggtitle("Distribution of Monthly Truck Crossings (2020-2024)") +
  xlab("Border Region") +
  ylab("Monthly Truck Crossings") +
  theme_minimal() +
  guides(fill = "none")
library(ggplot2)
# Average monthly truck total for each border (one row per border).
border_means <- monthly_trucks |>
  group_by(Border) |>
  summarise(mean_trucks = mean(total_trucks))

# Bar chart of those averages; bars are outlined in black and the fill legend
# is hidden because the x-axis already identifies each border.
ggplot(
  data = border_means,
  mapping = aes(Border, mean_trucks, fill = Border)
) +
  geom_col(colour = "black") +
  theme_minimal() +
  guides(fill = "none") +
  labs(
    title = "Average Monthly Truck Crossings (2020–2024)",
    x = "Border Region",
    y = "Average Monthly Trucks"
  )
We will define the population mean monthly truck crossings for the northern border as \(\mu_N\) and for the southern border as \(\mu_S\). The hypotheses are: \[H_0: \mu_N = \mu_S \qquad \text{versus} \qquad H_a: \mu_N \neq \mu_S\]
We will use a significance level of \(\alpha = 0.05\).
We will now conduct the t-test:
# Welch two-sample t-test comparing mean monthly truck totals between the two
# borders. The test is two-sided, does not assume equal variances, and uses a
# 95% confidence level to match alpha = 0.05. These are the t.test() defaults,
# written out so the analysis choices are explicit.
t_test_result <- t.test(
  total_trucks ~ Border,
  data = monthly_trucks,
  alternative = "two.sided",
  var.equal = FALSE,
  conf.level = 0.95
)
t_test_result
##
## Welch Two Sample t-test
##
## data: total_trucks by Border
## t = -16.617, df = 102.61, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group US-Canada Border and group US-Mexico Border is not equal to 0
## 95 percent confidence interval:
## -152866.6 -120266.6
## sample estimates:
## mean in group US-Canada Border mean in group US-Mexico Border
## 455899.2 592465.8
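With a p-value below 2.2e-16, far smaller than the significance level \(\alpha = 0.05\), we reject the null hypothesis: there is strong evidence that mean monthly inbound U.S. truck traffic differs between the northern and southern borders. The sample means show that the southern border carries more traffic, averaging about 592,466 trucks per month versus about 455,899 at the northern border, and the 95% confidence interval for the difference (northern minus southern) runs from about -152,867 to -120,267 trucks per month.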
Dataset: U.S. Bureau of Transportation Statistics. (n.d.). Border crossing/entry data. U.S. Department of Transportation. Retrieved from https://www.bts.gov/content/border-crossingentry-data - (direct link: https://catalog.data.gov/dataset/border-crossing-entry-data-683ae)