The following document was arranged using a data frame uploaded to kaggle.com by the U.S. Department of Statistics and provides some insights regarding economic factors that this environment hosts. In the data frame are 8 variables: Date, Border, Measure, Value, Location, State, Port Name, and Port Code. The data was collected over a period beginning in 1996 and ending in 2019. The values that are reported are averaged over the entire month, and are reported on the first day of each month, of each year.
##Load up the packages
library(dplyr)
library(tidyr)
library(stringr)
library(lubridate)
library(purrr)
library(modelr)
library(leaflet)
library(htmltools)
library(ggplot2)
library(gganimate)
library(shiny)
library(shinydashboard)
Now, I’m going to read in the csv file and manipulate the data frame to the desired format.
border <- read.csv("Border_crossing_Entry_data.csv")
sb <- border %>% filter(Border == "US-Mexico Border") %>%
droplevels.data.frame()
sb.1 <- sb
sb$Location <- gsub("POINT ", "", sb$Location)
sb$Location <- gsub("\\(", "", sb$Location)
sb$Location <- gsub(")", "", sb$Location)
sb <- sb %>% separate(Location, into = c("long", "lat"), sep = " ")
sb$lat <- as.numeric(sb$lat)
sb$long <- as.numeric(sb$long)
sb$Date <- as.Date.character(sb$Date, format = "%m/%d/%Y")
sb <- sb %>% mutate(Year = year(Date), Month = month(Date, label = T, abbr = T))
Now I’d like to visualize the data to see if anything intersting is happening.
sb %>%
group_by(Year, Value, State, Measure) %>%
summarize(
mean = mean(Value),
n = n()
) %>%
ggplot(aes(Measure, mean, fill = State)) +
geom_col() + coord_flip() + facet_wrap(~State) +
transition_states(Year, transition_length = 1, state_length = 1) +
ggtitle("Average Count of Each Measure by Year" ,
subtitle = "By. Trevor Jackson \nYear: {closest_state}" ) +
ylab("Average Value of Each Measure") +
xlab("")
sb %>%
group_by(Measure, State, Year) %>%
summarize(
mean = mean(Value),
Count = n()
) %>% ggplot(aes(Year, mean, color = Measure)) +
geom_line(alpha = 1 / 3) + facet_wrap(~State) +
transition_reveal(Year) +
ggtitle("Average Value of Each Measure",
subtitle = "By: Trevor Jackson") +
ylab("Average Values of Each Measure") +
xlab("")
And we see that ‘Pedestrian’, ‘Personal Vehicles’, and ‘Personal Vehicle Passengers’ represent the majority of the documented objects of surveilance crossing the southern border.
We should note that the drug war in Mexico was launched under president Felipe Calderon during the year 2006. The plot demonstrates a significant decrease in almost all variables around that time, indicating the economic impact that the drug war might have inflicted. Surprisingly, New Mexico actually experienced more interaction around that time. We see things begin to hike back up some years following the initial declaration of the drug war.
Let’s look at the correlations of each measure, and build a function that allows us to see them by year.
sb_cor <- sb %>% group_by(Year) %>%
nest()
cor_fun <- function(data) {
x <- data %>%
select(State, Month, Port.Code, Measure, Value) %>%
spread(Measure, Value)
cor(x[, 4:15], use = "pairwise")
}
sb_cor <- sb_cor %>%
mutate(cor = map(data, cor_fun))
sb_cor_1 <- sb_cor
cor_year <- function(year){
sb_cor %>%
filter(Year == year) %>% ##Allowing to see by year
select(cor) %>%
with(cor, table)
}
cor_year(2018L)
## Adding missing grouping variables: `Year`
## [[1]]
## Bus Passengers Buses Pedestrians
## Bus Passengers 1.0000000 0.620603152 0.253965757
## Buses 0.6206032 1.000000000 0.697796801
## Pedestrians 0.2539658 0.697796801 1.000000000
## Personal Vehicle Passengers 0.2538796 0.719775851 0.944533630
## Personal Vehicles 0.2050601 0.701385896 0.945196880
## Rail Containers Empty 0.4190228 0.145427942 -0.095505028
## Rail Containers Full 0.6480037 0.713092791 -0.007429902
## Train Passengers 0.7225843 0.001682223 0.886530807
## Trains 0.5914053 0.337412333 -0.019026774
## Truck Containers Empty 0.8153419 0.880664860 0.584690648
## Truck Containers Full 0.8129384 0.928718058 0.600517106
## Trucks 0.8192711 0.921458753 0.600547941
## Personal Vehicle Passengers Personal Vehicles
## Bus Passengers 0.25387964 0.2050601
## Buses 0.71977585 0.7013859
## Pedestrians 0.94453363 0.9451969
## Personal Vehicle Passengers 1.00000000 0.9964423
## Personal Vehicles 0.99644225 1.0000000
## Rail Containers Empty -0.18160214 -0.2528455
## Rail Containers Full -0.08255015 -0.1208785
## Train Passengers 0.76136670 0.7226677
## Trains -0.10981169 -0.1745198
## Truck Containers Empty 0.62545247 0.5951599
## Truck Containers Full 0.61737743 0.5919797
## Trucks 0.62568790 0.5992163
## Rail Containers Empty Rail Containers Full
## Bus Passengers 0.41902281 0.648003715
## Buses 0.14542794 0.713092791
## Pedestrians -0.09550503 -0.007429902
## Personal Vehicle Passengers -0.18160214 -0.082550149
## Personal Vehicles -0.25284553 -0.120878539
## Rail Containers Empty 1.00000000 0.878913887
## Rail Containers Full 0.87891389 1.000000000
## Train Passengers 0.92385636 0.927552395
## Trains 0.95789280 0.975192373
## Truck Containers Empty 0.30084309 0.671184371
## Truck Containers Full 0.34794889 0.772319615
## Trucks 0.33846467 0.751637133
## Train Passengers Trains Truck Containers Empty
## Bus Passengers 0.722584295 0.59140525 0.8153419
## Buses 0.001682223 0.33741233 0.8806649
## Pedestrians 0.886530807 -0.01902677 0.5846906
## Personal Vehicle Passengers 0.761366700 -0.10981169 0.6254525
## Personal Vehicles 0.722667654 -0.17451978 0.5951599
## Rail Containers Empty 0.923856357 0.95789280 0.3008431
## Rail Containers Full 0.927552395 0.97519237 0.6711844
## Train Passengers 1.000000000 0.99592397 0.1673107
## Trains 0.995923969 1.00000000 0.5092964
## Truck Containers Empty 0.167310663 0.50929638 1.0000000
## Truck Containers Full 0.181428995 0.54924902 0.9647184
## Trucks 0.190989968 0.54404205 0.9798877
## Truck Containers Full Trucks
## Bus Passengers 0.8129384 0.8192711
## Buses 0.9287181 0.9214588
## Pedestrians 0.6005171 0.6005479
## Personal Vehicle Passengers 0.6173774 0.6256879
## Personal Vehicles 0.5919797 0.5992163
## Rail Containers Empty 0.3479489 0.3384647
## Rail Containers Full 0.7723196 0.7516371
## Train Passengers 0.1814290 0.1909900
## Trains 0.5492490 0.5440421
## Truck Containers Empty 0.9647184 0.9798877
## Truck Containers Full 1.0000000 0.9974681
## Trucks 0.9974681 1.0000000
cor_year(2017L)
## Adding missing grouping variables: `Year`
## [[1]]
## Bus Passengers Buses Pedestrians
## Bus Passengers 1.0000000 0.67396510 0.27378864
## Buses 0.6739651 1.00000000 0.68143665
## Pedestrians 0.2737886 0.68143665 1.00000000
## Personal Vehicle Passengers 0.3187157 0.72294730 0.94077031
## Personal Vehicles 0.2522895 0.70234425 0.93792885
## Rail Containers Empty 0.4759153 0.14251677 -0.13548276
## Rail Containers Full 0.6571132 0.61771799 -0.07623398
## Train Passengers 0.6088593 -0.01085739 0.86120750
## Trains 0.6417006 0.35380771 -0.04050801
## Truck Containers Empty 0.8430839 0.81872057 0.62396240
## Truck Containers Full 0.8833095 0.90638364 0.55267717
## Trucks 0.8907422 0.91248028 0.56593572
## Personal Vehicle Passengers Personal Vehicles
## Bus Passengers 0.3187157 0.2522895
## Buses 0.7229473 0.7023443
## Pedestrians 0.9407703 0.9379288
## Personal Vehicle Passengers 1.0000000 0.9913703
## Personal Vehicles 0.9913703 1.0000000
## Rail Containers Empty -0.2183414 -0.2962551
## Rail Containers Full -0.1239672 -0.1657860
## Train Passengers 0.7808106 0.7128727
## Trains -0.1040700 -0.1715227
## Truck Containers Empty 0.7173905 0.6810568
## Truck Containers Full 0.6086427 0.5657969
## Trucks 0.6339776 0.5939626
## Rail Containers Empty Rail Containers Full
## Bus Passengers 0.4759153 0.65711324
## Buses 0.1425168 0.61771799
## Pedestrians -0.1354828 -0.07623398
## Personal Vehicle Passengers -0.2183414 -0.12396717
## Personal Vehicles -0.2962551 -0.16578599
## Rail Containers Empty 1.0000000 0.91357099
## Rail Containers Full 0.9135710 1.00000000
## Train Passengers 0.9070672 0.87384223
## Trains 0.9504146 0.98598200
## Truck Containers Empty 0.2922476 0.52622156
## Truck Containers Full 0.3772152 0.70504349
## Trucks 0.3757660 0.69912020
## Train Passengers Trains Truck Containers Empty
## Bus Passengers 0.60885932 0.64170062 0.8430839
## Buses -0.01085739 0.35380771 0.8187206
## Pedestrians 0.86120750 -0.04050801 0.6239624
## Personal Vehicle Passengers 0.78081064 -0.10406997 0.7173905
## Personal Vehicles 0.71287275 -0.17152271 0.6810568
## Rail Containers Empty 0.90706723 0.95041455 0.2922476
## Rail Containers Full 0.87384223 0.98598200 0.5262216
## Train Passengers 1.00000000 0.93804888 0.6006285
## Trains 0.93804888 1.00000000 0.4993533
## Truck Containers Empty 0.60062849 0.49935332 1.0000000
## Truck Containers Full 0.16938109 0.58311206 0.9271747
## Trucks 0.22862879 0.58805892 0.9394696
## Truck Containers Full Trucks
## Bus Passengers 0.8833095 0.8907422
## Buses 0.9063836 0.9124803
## Pedestrians 0.5526772 0.5659357
## Personal Vehicle Passengers 0.6086427 0.6339776
## Personal Vehicles 0.5657969 0.5939626
## Rail Containers Empty 0.3772152 0.3757660
## Rail Containers Full 0.7050435 0.6991202
## Train Passengers 0.1693811 0.2286288
## Trains 0.5831121 0.5880589
## Truck Containers Empty 0.9271747 0.9394696
## Truck Containers Full 1.0000000 0.9829728
## Trucks 0.9829728 1.0000000
We can see some of the variables are very highly correlated.
We can build a simple model for each year.
sb_nest <- sb %>% group_by(Year, Month, Port.Code, State, Measure, Value) %>%
spread(Measure, Value) %>% group_by(Year) %>% nest()
reg_ped <- function(df) {
lm(formula = `Pedestrians` ~ `Personal Vehicle Passengers` +
Month + State,
data = df)
}
sb_nest <- sb_nest %>% mutate(Ped.Model = map(data, reg_ped))
Now we can look at some of the residuals.
sb_nest <- sb_nest %>% mutate(Residuals = map2(data, Ped.Model, add_residuals))
Ped_resids <- unnest(data = sb_nest, Residuals)
Ped_resids %>% ggplot(aes(Year, resid, group = `Pedestrians`)) +
geom_line() + facet_wrap(~State) +
ggtitle("Plot of Residuals", subtitle = "By: Trevor Jackson") +
xlab("Year") + ylab("Residuals for the Linear Regression Model")
Ped_resids %>% ggplot(aes(Year, resid)) +
geom_jitter(alpha = 1/3) + facet_wrap(~State) +
ggtitle("Plot of Residuals", subtitle = "By: Trevor Jackson") +
xlab("Year") + ylab("Residuals for the Linear Regression Model")
The model doesn’t look too bad. It definitely does better in some states than others.
glance <- sb_nest %>% mutate(glance = map(Ped.Model, broom::glance)) %>%
unnest(glance)
glance %>% arrange(r.squared)
## # A tibble: 24 x 15
## # Groups: Year [24]
## Year data Ped.Model Residuals r.squared adj.r.squared sigma statistic
## <dbl> <list<df[> <list> <list> <dbl> <dbl> <dbl> <dbl>
## 1 1996 [300 × 20] <lm> <tibble … 0.591 0.570 1.11e5 27.4
## 2 1997 [300 × 20] <lm> <tibble … 0.602 0.581 1.37e5 28.6
## 3 1999 [300 × 20] <lm> <tibble … 0.622 0.602 1.36e5 31.2
## 4 2000 [300 × 20] <lm> <tibble … 0.643 0.624 1.30e5 34.1
## 5 1998 [300 × 20] <lm> <tibble … 0.656 0.638 1.23e5 36.1
## 6 2002 [300 × 20] <lm> <tibble … 0.694 0.678 1.30e5 43.0
## 7 2006 [300 × 20] <lm> <tibble … 0.713 0.698 1.16e5 47.0
## 8 2007 [300 × 20] <lm> <tibble … 0.723 0.708 1.21e5 49.4
## 9 2003 [300 × 20] <lm> <tibble … 0.723 0.709 1.21e5 49.5
## 10 2001 [300 × 20] <lm> <tibble … 0.735 0.721 1.32e5 52.4
## # … with 14 more rows, and 7 more variables: p.value <dbl>, df <int>,
## # logLik <dbl>, AIC <dbl>, BIC <dbl>, deviance <dbl>, df.residual <int>
glance %>% ggplot(aes(Year, r.squared)) + geom_jitter()
We can see that it definitely improves with time. Let’s see what kind of prediction it can make.
df <- tribble(
~Port.Name, ~State, ~Port.Code, ~Year, ~Month, ~`Personal Vehicle Passengers`, ~`Pedestrians`,
"Brownsville", "Texas", 2301, 2019, "01/04/2019", 848900, NA
)
df <- as.data.frame(df)
df$Port.Name <- as.factor(df$Port.Name)
df$State <- as.factor(df$State)
df$Year <- as.factor(df$Year)
df$Month <- month(as.Date.character(df$Month), label = T, abbr = )
df$`Personal Vehicle Passengers` <- as.numeric(df$`Personal Vehicle Passengers`)
df$Pedestrians <- as.numeric(df$Pedestrians)
data_1 <- sb_nest %>% filter(Year == 2018L) %>%
select(-Ped.Model, -Residuals) %>% unnest(data)
reg <- lm(formula = Pedestrians ~ Month + State +
Port.Code + Port.Name +
`Personal Vehicle Passengers`,
data = data_1)
df$Pedestrians <- predict(reg, newdata = df)
df
## Port.Name State Port.Code Year Month Personal Vehicle Passengers
## 1 Brownsville Texas 2301 2019 Apr 848900
## Pedestrians
## 1 242503.4
So the model made a prediction assuming that in Brownsville, Texas on April, 04 2019 if there were 848,900 Personal Vehicle Passengers crossings during the entire month, that there would be an estimated 242,503 Pedestrian crossings.
We can compare that to prior years.
sb[sb$Port.Name == "Brownsville" & sb$Year == "2018" &
sb$Month == "Apr" & sb$Measure == c("Personal Vehicle Passengers", "Pedestrians"),]
## Port.Name State Port.Code Border Date
## 2192 Brownsville Texas 2301 US-Mexico Border 2018-04-01
## 2227 Brownsville Texas 2301 US-Mexico Border 2018-04-01
## Measure Value long lat Year Month
## 2192 Pedestrians 243547 -97.49722 25.90139 2018 Apr
## 2227 Personal Vehicle Passengers 857662 -97.49722 25.90139 2018 Apr
sb[sb$Port.Name == "Brownsville" & sb$Year == "2017" &
sb$Month == "Apr" & sb$Measure == c("Personal Vehicle Passengers", "Pedestrians"),]
## Port.Name State Port.Code Border Date
## 4430 Brownsville Texas 2301 US-Mexico Border 2017-04-01
## 4491 Brownsville Texas 2301 US-Mexico Border 2017-04-01
## Measure Value long lat Year Month
## 4430 Pedestrians 238506 -97.49722 25.90139 2017 Apr
## 4491 Personal Vehicle Passengers 868605 -97.49722 25.90139 2017 Apr
sb[sb$Port.Name == "Brownsville" & sb$Year == "2016" &
sb$Month == "Apr" & sb$Measure == c("Personal Vehicle Passengers", "Pedestrians"),]
## Port.Name State Port.Code Border Date
## 7307 Brownsville Texas 2301 US-Mexico Border 2016-04-01
## 7442 Brownsville Texas 2301 US-Mexico Border 2016-04-01
## Measure Value long lat Year Month
## 7307 Personal Vehicle Passengers 746743 -97.5 25.89 2016 Apr
## 7442 Pedestrians 202669 -97.5 25.89 2016 Apr
So that’s not too bad. The proportions are relatively consistent.
I have also made an interactive map of the data. If you wish to see it, please visit my page at “https://writetrevorjackson.shinyapps.io/South_Border/”
Below, is the code that I authored for my page. Note that I have put it in quotations, so it won’t be executed in the markdown document.
'"library(shiny)
library(shinydashboard)
library(dplyr)
library(tidyr)
library(stringr)
library(lubridate)
library(leaflet)
library(htmltools)
border <- read.csv("Border_crossing_Entry_data.csv")
sb <- border %>% filter(Border == "US-Mexico Border") %>%
droplevels.data.frame()
sb.1 <- sb
sb$Location <- gsub("POINT ", "", sb$Location)
sb$Location <- gsub("\\(", "", sb$Location)
sb$Location <- gsub(")", "", sb$Location)
sb <- sb %>% separate(Location, into = c("long", "lat"), sep = " ")
sb$lat <- as.numeric(sb$lat)
sb$long <- as.numeric(sb$long)
sb$Date <- as.Date.character(sb$Date, format = "%m/%d/%Y")
head(sb$Date)
sb <- sb %>% mutate(Year = year(Date), Month = month(Date, label = T, abbr = T))
summary(sb$Measure)
ui <- dashboardPage(
skin = "green",
dashboardHeader(title = "Southern Border"),
dashboardSidebar(
selectInput("year_range", label = "Year",
choices = c( "1996" = 1996L, "1997" = 1997L, "1998" = 1998L,
"1999" = 1999L, "2000" = 2000L, "2001" = 2001L,
"2002" = 2002L, "2003" = 2003L, "2004" = 2004L,
"2005" = 2005L, "2006" = 2006L, "2007" = 2007L,
"2008" = 2008L, "2009" = 2009L, "2010" = 2010L,
"2011" = 2011L, "2012" = 2012L, "2013" = 2013L,
"2014" = 2014L, "2015" = 2015L, "2016" = 2016L,
"2017" = 2017L, "2018" = 2018L, "2019" = 2019L),
selected = "2019",
selectize = F),
radioButtons("measure", label = "Measure",
choices = c(
"Bus Passengers" ="Bus Passengers",
"Buses" = "Buses",
"Pedestrians" = "Pedestrians",
"Personal Vehicle Passengers" = "Personal Vehicle Passengers",
"Personal Vehicles" = "Personal Vehicles",
"Rail Containers Empty" = "Rail Containers Empty",
"Rail Containers Full" = "Rail Containers Full",
"Trains" = "Trains",
"Truck Containers Empty" = "Truck Containers Empty",
"Truck Containers Full" = "Truck Containers Full",
"Trucks" = "Trucks"
))
),
dashboardBody(
fluidRow(box(width = 12, leafletOutput(outputId = "mymap"))),
fluidRow(box(width = 12, dataTableOutput(outputId = "summary")))
)
)
server <- function(input, output) {
data_input <- reactive({ sb %>%
filter(Year == input$year_range) %>%
filter(Measure == input$measure) %>%
group_by(State, Measure, Port.Code, Port.Name, long, lat) %>%
summarize(Mean = round(mean(Value), 2))
})
labels <- reactive({
paste("<p> Port Name:", data_input()$Port.Name, "</p>",
"<p> Port Code:", data_input()$Port.Code, "</p",
"<p>", data_input()$State, "</p>",
"<p>", data_input()$Measure, "</p>",
"<p> Average Value:", data_input()$Mean, "</p>")
})
output$mymap <- renderLeaflet(
leaflet() %>%
addProviderTiles(providers$OpenStreetMap.France) %>%
setView(-105, 29, zoom = 4.9) %>%
addMarkers(
data = data_input(),
lng = data_input()$long,
lat = data_input()$lat,
label = lapply(labels(), HTML)
) )
output$summary <- renderDataTable(data_input())
}
shinyApp(ui = ui, server = server)
"'