Analyzing Traffic Patterns and Predicting Congestion

MD Asadul Islam & Md Simon Chowdhury

2024-12-18

Introduction

Data Preparation

Required Libraries

library(tidyverse)  # Data manipulation
library(lubridate)  # Date handling
library(ggplot2)    # Visualization
library(sf)         # Geospatial data
library(DT)         # Interactive tables

Loading and Inspecting Data

##   RequestID   Boro   Yr M  D HH MM Vol SegmentID
## 1     32970 Queens 2021 4 30  2  0   0    149701
## 2     32970 Queens 2021 4 30  2 15   1    149701
## 3     32970 Queens 2021 4 30  2 30   0    149701
## 4     32970 Queens 2021 4 30  2 45   0    149701
## 5     32970 Queens 2021 4 30  3  0   1    149701
## 6     32970 Queens 2021 4 30  3 15   0    149701
##                                        WktGeom         street
## 1 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 2 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 3 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 4 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 5 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 6 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
##                    fromSt     toSt Direction
## 1 Newtown Creek Shoreline Dead end        NB
## 2 Newtown Creek Shoreline Dead end        NB
## 3 Newtown Creek Shoreline Dead end        NB
## 4 Newtown Creek Shoreline Dead end        NB
## 5 Newtown Creek Shoreline Dead end        NB
## 6 Newtown Creek Shoreline Dead end        NB
## 'data.frame':    1712605 obs. of  14 variables:
##  $ RequestID: int  32970 32970 32970 32970 32970 32970 32970 32970 32970 32970 ...
##  $ Boro     : chr  "Queens" "Queens" "Queens" "Queens" ...
##  $ Yr       : int  2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
##  $ M        : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ D        : int  30 30 30 30 30 30 30 30 30 30 ...
##  $ HH       : int  2 2 2 2 3 3 3 3 4 4 ...
##  $ MM       : int  0 15 30 45 0 15 30 45 0 15 ...
##  $ Vol      : int  0 1 0 0 1 0 0 1 0 0 ...
##  $ SegmentID: int  149701 149701 149701 149701 149701 149701 149701 149701 149701 149701 ...
##  $ WktGeom  : chr  "POINT (997407.0998491726 208620.92612708386)" "POINT (997407.0998491726 208620.92612708386)" "POINT (997407.0998491726 208620.92612708386)" "POINT (997407.0998491726 208620.92612708386)" ...
##  $ street   : chr  "PULASKI BRIDGE" "PULASKI BRIDGE" "PULASKI BRIDGE" "PULASKI BRIDGE" ...
##  $ fromSt   : chr  "Newtown Creek Shoreline" "Newtown Creek Shoreline" "Newtown Creek Shoreline" "Newtown Creek Shoreline" ...
##  $ toSt     : chr  "Dead end" "Dead end" "Dead end" "Dead end" ...
##  $ Direction: chr  "NB" "NB" "NB" "NB" ...

Initial Observations: - Rows: 1712605 - Columns: 14

Data Cleaning

Key Steps

traffic_data <- traffic_data %>%
  mutate(
    Date = make_date(Yr, M, D),
    Time = sprintf("%02d:%02d:%02d", HH, MM, 0),
    DateTime = as.POSIXct(paste(Date, Time), format="%Y-%m-%d %H:%M:%S")
  ) %>%
  filter(Vol >= 0) %>%
  distinct()

# Check for missing values in each column
missing_values <- colSums(is.na(traffic_data))

traffic_data$Vol[traffic_data$Vol < 0] <- NA
traffic_data$Vol[is.na(traffic_data$Vol)] <- median(traffic_data$Vol, na.rm = TRUE)

Exploratory Data Analysis

Volume Over Time

time_volume <- traffic_data %>%
  group_by(DateTime) %>%
  summarise(Total_Vol = sum(Vol))

ggplot(time_volume, aes(x = DateTime, y = Total_Vol)) +
  geom_line(color = "blue", size = 1) +
  labs(
    title = "Traffic Volume Over Time",
    x = "Time",
    y = "Volume"
  ) +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Traffic Volume by Borough

borough_volume <- traffic_data %>%
  group_by(Boro) %>%
  summarise(Total_Vol = sum(Vol))

ggplot(borough_volume, aes(x = Boro, y = Total_Vol, fill = Boro)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = Total_Vol), vjust = -0.5) +
  labs(
    title = "Traffic Volume by Borough",
    x = "Borough",
    y = "Volume"
  ) +
  theme_minimal()

Focus on Congestion Hotspots

Pulaski Bridge Case Study

station_data <- traffic_data %>%
  filter(street == "PULASKI BRIDGE") %>%
  mutate(Hour = HH) %>%
  group_by(Hour) %>%
  summarise(Average_Volume = mean(Vol, na.rm = TRUE))

ggplot(station_data, aes(x = Hour, y = Average_Volume)) +
  geom_line(color = "blue", size = 1) +
  geom_point(color = "red", size = 2) +
  labs(
    title = "Hourly Traffic Volume at Pulaski Bridge",
    x = "Hour of the Day",
    y = "Average Traffic Volume"
  ) +
  theme_minimal()

Interactive Hotspot Map

# Filter data for valid WKT geometries
traffic_data <- traffic_data %>% filter(!is.na(WktGeom))
traffic_data <- traffic_data %>% filter(grepl("^POINT\\s\\(.+\\)$", WktGeom))

# Convert to spatial data
traffic_sf <- st_as_sf(traffic_data, wkt = "WktGeom", crs = 4326)

# Fix invalid geometries
traffic_sf <- traffic_sf %>% st_make_valid()
## Warning in st_is_longlat(x): bounding box has potentially an invalid value
## range for longlat data
# Plot congestion hotspots
ggplot(data = traffic_sf) +
  geom_sf(aes(color = Vol)) +
  labs(
    title = "Congestion Hotspots",
    color = "Volume"
  ) +
  theme_minimal()

Key Insights

Conclusion