Analyzing Traffic Patterns and Predicting Congestion

Introduction

Urban traffic congestion negatively impacts travel efficiency and quality of life. This project analyzes traffic data for New York City (NYC) to understand traffic patterns, predict congestion, and provide actionable insights for mitigation strategies. The data was collected from the NYC OpenData portal (https://opendata.cityofnewyork.us/).

The dataset contains traffic data, including fields such as RequestID, Boro, Vol, Direction, and WktGeom. The goal is to identify congestion patterns across time and locations, focusing on peak times, high-traffic boroughs, and specific hotspots.

Expected outcomes include identifying areas with significant congestion, visualizing temporal patterns, and providing insights for better traffic management. The results will help identify peak congestion times, high-traffic areas, and patterns over time, ultimately assisting in urban planning and policy formulation.

Data Loading and Preparation

Load Required Libraries

# Load necessary libraries
library(tidyverse)  # For data manipulation and visualization
library(lubridate)  # For handling date and time data
library(ggplot2)    # For creating plots
library(sf)         # For geospatial data analysis
library(DT)         # For interactive data tables

Load Data

# Read the automated traffic volume counts export into a data frame
traffic_data <- read.csv(
  file = "C:/Users/unnia/Downloads/DAta607/final project/Automated_Traffic_Volume_Counts_20241207.csv"
)

# Preview the leading rows to confirm the import looks sensible
head(traffic_data, n = 6L)

##   RequestID   Boro   Yr M  D HH MM Vol SegmentID
## 1     32970 Queens 2021 4 30  2  0   0    149701
## 2     32970 Queens 2021 4 30  2 15   1    149701
## 3     32970 Queens 2021 4 30  2 30   0    149701
## 4     32970 Queens 2021 4 30  2 45   0    149701
## 5     32970 Queens 2021 4 30  3  0   1    149701
## 6     32970 Queens 2021 4 30  3 15   0    149701
##                                        WktGeom         street
## 1 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 2 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 3 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 4 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 5 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 6 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
##                    fromSt     toSt Direction
## 1 Newtown Creek Shoreline Dead end        NB
## 2 Newtown Creek Shoreline Dead end        NB
## 3 Newtown Creek Shoreline Dead end        NB
## 4 Newtown Creek Shoreline Dead end        NB
## 5 Newtown Creek Shoreline Dead end        NB
## 6 Newtown Creek Shoreline Dead end        NB

# Summarise column names, types, and a sample of values for each column
str(object = traffic_data)

## 'data.frame':    1712605 obs. of  14 variables:
##  $ RequestID: int  32970 32970 32970 32970 32970 32970 32970 32970 32970 32970 ...
##  $ Boro     : chr  "Queens" "Queens" "Queens" "Queens" ...
##  $ Yr       : int  2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
##  $ M        : int  4 4 4 4 4 4 4 4 4 4 ...
##  $ D        : int  30 30 30 30 30 30 30 30 30 30 ...
##  $ HH       : int  2 2 2 2 3 3 3 3 4 4 ...
##  $ MM       : int  0 15 30 45 0 15 30 45 0 15 ...
##  $ Vol      : int  0 1 0 0 1 0 0 1 0 0 ...
##  $ SegmentID: int  149701 149701 149701 149701 149701 149701 149701 149701 149701 149701 ...
##  $ WktGeom  : chr  "POINT (997407.0998491726 208620.92612708386)" "POINT (997407.0998491726 208620.92612708386)" "POINT (997407.0998491726 208620.92612708386)" "POINT (997407.0998491726 208620.92612708386)" ...
##  $ street   : chr  "PULASKI BRIDGE" "PULASKI BRIDGE" "PULASKI BRIDGE" "PULASKI BRIDGE" ...
##  $ fromSt   : chr  "Newtown Creek Shoreline" "Newtown Creek Shoreline" "Newtown Creek Shoreline" "Newtown Creek Shoreline" ...
##  $ toSt     : chr  "Dead end" "Dead end" "Dead end" "Dead end" ...
##  $ Direction: chr  "NB" "NB" "NB" "NB" ...

Data Cleaning

Initial Cleaning

# Build date/time columns from the integer Yr / M / D / HH / MM fields
traffic_data <- traffic_data %>%
  mutate(
    Date = make_date(Yr, M, D),  # Calendar date of the count
    Time = sprintf("%02d:%02d:%02d", HH, MM, 0), # Format time as "HH:MM:SS"
    # Parse with an explicit time zone. Without `tz`, as.POSIXct() uses the
    # session's local zone, so the same CSV yields different instants on
    # different machines, and wall-clock times that fall inside a daylight
    # saving transition can come back NA or shifted depending on the platform.
    # "UTC" has no DST, so every recorded wall-clock time maps to exactly one
    # value. NOTE(review): the counts are presumably NYC local wall-clock
    # times; UTC is used here only as a DST-free label, not as a conversion.
    DateTime = as.POSIXct(
      paste(Date, Time),
      format = "%Y-%m-%d %H:%M:%S",
      tz = "UTC"
    )
  )

# Remove duplicate rows (rows identical across every column)
traffic_data <- traffic_data %>%
  distinct()

# Count missing values per column; printed so the report shows the result
missing_values <- colSums(is.na(traffic_data))
print(missing_values)

# Show the first rows of the cleaned dataset as an interactive table
datatable(head(traffic_data))

Handling Negative and Missing Values

# Work on a local copy of the volume column, then write it back once
vol <- traffic_data$Vol

# Negative counts are treated as missing
vol[vol < 0] <- NA

# Missing counts are filled with the median of the remaining values
vol[is.na(vol)] <- median(vol, na.rm = TRUE)

traffic_data$Vol <- vol

Data Tidying and Aggregation

Hourly Traffic Extraction

# Total vehicles per date, hour of day, and street segment (street/fromSt/toSt).
# The Hour column is created directly in group_by() from the HH field.
hourly_traffic <- summarise(
  group_by(traffic_data, Date, Hour = HH, street, fromSt, toSt),
  Total_Vehicles = sum(Vol, na.rm = TRUE),
  .groups = "drop"
)

# Preview the aggregated table
hourly_preview <- head(hourly_traffic)
datatable(hourly_preview)

Data Exploration

Traffic Volume Over Time

# Total volume across all locations for each 15-minute timestamp
time_volume <- traffic_data %>%
  group_by(DateTime) %>%
  summarise(Total_Vol = sum(Vol))

# Plot volume over time.
# The series covers many days, so the axis is labelled with dates and ggplot2
# picks the break positions. The previous settings (a break every 2 hours,
# labelled "%H:%M") asked for a tick every two hours over the whole span and
# dropped the date from each label, leaving the axis unreadable.
ggplot(time_volume, aes(x = DateTime, y = Total_Vol)) +
  geom_line(color = "blue") +
  labs(
    title = "Traffic Volume Over Time",
    x = "Time",
    y = "Volume"
  ) +
  theme_minimal() +
  scale_x_datetime(date_labels = "%Y-%m-%d")

Traffic Volume by Borough

# Sum the recorded volume within each borough
borough_volume <- summarise(
  group_by(traffic_data, Boro),
  Total_Vol = sum(Vol)
)

# One bar per borough, with the total printed just above each bar
ggplot(borough_volume, aes(x = Boro, y = Total_Vol, fill = Boro)) +
  geom_col() +  # bar heights are taken directly from Total_Vol
  geom_text(aes(label = Total_Vol), vjust = -0.3, size = 3.5) +
  theme_minimal() +
  labs(
    title = "Traffic Volume by Borough",
    x = "Borough",
    y = "Volume"
  )

Focus on a Specific Station

# Keep only the hourly totals recorded on "PULASKI BRIDGE"
station_data <- filter(hourly_traffic, street == "PULASKI BRIDGE")

# Average the hourly totals for each hour of the day. Grouping is by Hour only,
# so the mean runs over every date and segment present in station_data.
station_hourly_avg <- summarise(
  group_by(station_data, Hour),
  Average_Volume = mean(Total_Vehicles, na.rm = TRUE),
  .groups = "drop"
)

# Show the per-hour averages as an interactive table
datatable(station_hourly_avg)

Plotting High and Low Traffic Hours

# Plot average traffic volume for each hour of the day.
# Line thickness is set with `linewidth`: using `size` for lines has been
# deprecated since ggplot2 3.4.0 and triggers a deprecation warning.
# `size` remains the correct argument for the point layer.
ggplot(station_hourly_avg, aes(x = Hour, y = Average_Volume)) +
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 2) +
  labs(
    title = "Average Hourly Traffic Volume at PULASKI BRIDGE",
    x = "Hour of the Day",
    y = "Average Traffic Volume"
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(0, 23, 1))  # one tick per hour, 0-23

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

Geospatial Analysis

Congestion Hotspots

# Keep only rows whose geometry is a non-missing WKT "POINT (...)" string
traffic_data <- traffic_data %>%
  filter(!is.na(WktGeom), grepl("^POINT\\s\\(.+\\)$", WktGeom))

# Convert to spatial data.
# The WKT coordinates (e.g. 997407.1, 208620.9) are projected x/y values, not
# longitude/latitude degrees, so tagging them as EPSG:4326 mislabels every
# point and makes sf warn about an invalid longlat bounding box. They are
# declared as EPSG:2263 (NAD83 / New York Long Island, US survey feet).
# NOTE(review): EPSG:2263 is inferred from the coordinate magnitudes and the
# NYC origin of the data; confirm against the dataset's metadata.
traffic_sf <- st_as_sf(traffic_data, wkt = "WktGeom", crs = 2263)

# Fix invalid geometries while still in the projected CRS
traffic_sf <- traffic_sf %>% st_make_valid()

# Reproject to WGS84 so traffic_sf carries true longitude/latitude in
# EPSG:4326, the CRS the original code labelled it with
traffic_sf <- traffic_sf %>% st_transform(4326)

## Warning in st_is_longlat(x): bounding box has potentially an invalid value
## range for longlat data

# Draw every count location, coloured by its recorded volume
ggplot(traffic_sf) +
  geom_sf(mapping = aes(color = Vol)) +
  theme_minimal() +
  labs(title = "Congestion Hotspots", color = "Volume")

Data Saving and Sharing

Saving Cleaned Data

# Write the cleaned table to CSV in the working directory (no row-name column)
write.csv(x = traffic_data, file = "cleaned_traffic_data.csv", row.names = FALSE)

# Serialise the spatial object so it can be restored later with readRDS()
saveRDS(object = traffic_sf, file = "traffic_sf.rds")

Conclusion

This analysis provided insights into traffic congestion patterns across time and locations. The results highlight peak congestion hours, high-traffic boroughs, and specific hotspots such as the PULASKI BRIDGE.

Future work could involve integrating external factors such as weather or special events and leveraging predictive models to forecast congestion. These findings can help policymakers and urban planners optimize traffic management strategies.