Urban traffic congestion negatively impacts travel efficiency and quality of life. This project analyzes traffic data for New York City (NYC) to understand traffic patterns, predict congestion, and provide actionable insights for mitigation strategies. The data was collected from the NYC OpenData portal (https://opendata.cityofnewyork.us/).
The dataset contains traffic data, including fields such as
RequestID, Boro, Vol,
Direction, and WktGeom. The goal is to
identify congestion patterns across time and locations, focusing on peak
times, high-traffic boroughs, and specific hotspots.
Expected outcomes include identifying areas with significant congestion, visualizing temporal patterns, and providing insights for better traffic management. The results will help identify peak congestion times, high-traffic areas, and patterns over time, ultimately assisting in urban planning and policy formulation.
# Load necessary libraries
library(tidyverse) # For data manipulation and visualization
library(lubridate) # For handling date and time data
library(ggplot2) # For creating plots
library(sf) # For geospatial data analysis
library(DT) # For interactive data tables
# Read the raw traffic-count export into a data frame
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact file location exists.
traffic_data <- read.csv(
  file = "C:/Users/unnia/Downloads/DAta607/final project/Automated_Traffic_Volume_Counts_20241207.csv"
)
# Preview the first six rows
head(traffic_data, n = 6L)
## RequestID Boro Yr M D HH MM Vol SegmentID
## 1 32970 Queens 2021 4 30 2 0 0 149701
## 2 32970 Queens 2021 4 30 2 15 1 149701
## 3 32970 Queens 2021 4 30 2 30 0 149701
## 4 32970 Queens 2021 4 30 2 45 0 149701
## 5 32970 Queens 2021 4 30 3 0 1 149701
## 6 32970 Queens 2021 4 30 3 15 0 149701
## WktGeom street
## 1 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 2 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 3 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 4 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 5 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 6 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## fromSt toSt Direction
## 1 Newtown Creek Shoreline Dead end NB
## 2 Newtown Creek Shoreline Dead end NB
## 3 Newtown Creek Shoreline Dead end NB
## 4 Newtown Creek Shoreline Dead end NB
## 5 Newtown Creek Shoreline Dead end NB
## 6 Newtown Creek Shoreline Dead end NB
# Show each column's type together with a preview of its values
str(object = traffic_data)
## 'data.frame': 1712605 obs. of 14 variables:
## $ RequestID: int 32970 32970 32970 32970 32970 32970 32970 32970 32970 32970 ...
## $ Boro : chr "Queens" "Queens" "Queens" "Queens" ...
## $ Yr : int 2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
## $ M : int 4 4 4 4 4 4 4 4 4 4 ...
## $ D : int 30 30 30 30 30 30 30 30 30 30 ...
## $ HH : int 2 2 2 2 3 3 3 3 4 4 ...
## $ MM : int 0 15 30 45 0 15 30 45 0 15 ...
## $ Vol : int 0 1 0 0 1 0 0 1 0 0 ...
## $ SegmentID: int 149701 149701 149701 149701 149701 149701 149701 149701 149701 149701 ...
## $ WktGeom : chr "POINT (997407.0998491726 208620.92612708386)" "POINT (997407.0998491726 208620.92612708386)" "POINT (997407.0998491726 208620.92612708386)" "POINT (997407.0998491726 208620.92612708386)" ...
## $ street : chr "PULASKI BRIDGE" "PULASKI BRIDGE" "PULASKI BRIDGE" "PULASKI BRIDGE" ...
## $ fromSt : chr "Newtown Creek Shoreline" "Newtown Creek Shoreline" "Newtown Creek Shoreline" "Newtown Creek Shoreline" ...
## $ toSt : chr "Dead end" "Dead end" "Dead end" "Dead end" ...
## $ Direction: chr "NB" "NB" "NB" "NB" ...
# Convert date and time fields
#   Date     - calendar date assembled from the Yr / M / D columns
#   Time     - time of the count formatted as an "HH:MM:SS" string
#   DateTime - timestamp built directly from the numeric components
#
# DateTime is constructed with make_datetime() and an explicit time zone
# rather than by parsing a pasted string with as.POSIXct(): without `tz`,
# as.POSIXct() uses the time zone of the machine running the script, so the
# result is not reproducible, and in zones with daylight saving a reading
# stamped inside the spring-forward gap can parse to NA.
# "UTC" keeps every recorded clock value exactly as written.
# NOTE(review): the counts are presumably stamped in NYC local wall-clock
# time; DateTime should be read as that wall-clock value, not as a true UTC
# instant - confirm against the dataset documentation.
traffic_data <- traffic_data %>%
  mutate(
    Date = make_date(Yr, M, D),
    Time = sprintf("%02d:%02d:%02d", HH, MM, 0),
    DateTime = make_datetime(Yr, M, D, HH, MM, tz = "UTC")
  )
# Drop rows that are exact duplicates across every column
traffic_data <- distinct(traffic_data)
# Count the NA entries per column and show the tally
missing_values <- traffic_data %>%
  is.na() %>%
  colSums()
print(missing_values)
## RequestID Boro Yr M D HH MM Vol
## 0 0 0 0 0 0 0 0
## SegmentID WktGeom street fromSt toSt Direction Date Time
## 0 0 0 0 0 0 0 0
## DateTime
## 0
# Interactive preview of the first rows of the de-duplicated data
traffic_data %>%
  head() %>%
  datatable()
# Clean the Vol column in two sequential steps:
#   1. negative counts are treated as missing (set to NA)
#   2. every missing count is filled with the median of the remaining values
# The second step sees the result of the first, so the median is computed
# after the negative values have been blanked out.
traffic_data <- traffic_data %>%
  mutate(
    Vol = replace(Vol, Vol < 0, NA),
    Vol = replace(Vol, is.na(Vol), median(Vol, na.rm = TRUE))
  )
# Hourly totals: sum the counts within each Date / Hour / street / fromSt /
# toSt combination (Hour is the HH column under a clearer name)
hourly_traffic <- traffic_data %>%
  group_by(Date, Hour = HH, street, fromSt, toSt) %>%
  summarise(Total_Vehicles = sum(Vol, na.rm = TRUE), .groups = "drop")
# Interactive preview of the first rows of the hourly table
hourly_traffic %>%
  head() %>%
  datatable()
# Aggregate volume by time: total count over all rows sharing a timestamp
time_volume <- traffic_data %>%
  group_by(DateTime) %>%
  summarise(Total_Vol = sum(Vol))
# Plot volume over time
# The x axis uses ggplot2's automatically chosen breaks, labelled with the
# calendar date. A fixed 2-hour break interval labelled only with "%H:%M"
# is readable for a single day at most: over a longer span it creates a
# break every two hours across the whole range and the labels no longer say
# which day they belong to.
# NOTE(review): assumes the data covers many days, which the row count of
# the loaded table strongly suggests - confirm the actual date range.
ggplot(time_volume, aes(x = DateTime, y = Total_Vol)) +
  geom_line(color = "blue") +
  labs(
    title = "Traffic Volume Over Time",
    x = "Time",
    y = "Volume"
  ) +
  theme_minimal() +
  scale_x_datetime(date_labels = "%Y-%m-%d")
# Total recorded volume per borough
borough_volume <- summarise(
  group_by(traffic_data, Boro),
  Total_Vol = sum(Vol)
)
# Bar chart of the borough totals, with each bar's value printed above it
ggplot(data = borough_volume, mapping = aes(x = Boro, y = Total_Vol, fill = Boro)) +
  geom_col() +
  geom_text(mapping = aes(label = Total_Vol), vjust = -0.3, size = 3.5) +
  labs(title = "Traffic Volume by Borough", x = "Borough", y = "Volume") +
  theme_minimal()
# Keep only the hourly rows recorded on the street "PULASKI BRIDGE"
station_data <- filter(hourly_traffic, street == "PULASKI BRIDGE")
# For each hour of the day, average the hourly totals over every row of the
# station data (i.e. over all dates and all fromSt/toSt combinations)
station_hourly_avg <- station_data %>%
  group_by(Hour) %>%
  summarise(
    Average_Volume = mean(Total_Vehicles, na.rm = TRUE),
    .groups = "drop"
  )
# Interactive table of the per-hour averages
datatable(station_hourly_avg)
# Plot average traffic volume for each hour of the day
# Line thickness is set with `linewidth`: using `size` for lines has been
# deprecated since ggplot2 3.4.0 and raises a deprecation warning.
# `size` is still the correct argument for the point markers.
ggplot(station_hourly_avg, aes(x = Hour, y = Average_Volume)) +
  geom_line(color = "blue", linewidth = 1) +
  geom_point(color = "red", size = 2) +
  labs(
    title = "Average Hourly Traffic Volume at PULASKI BRIDGE",
    x = "Hour of the Day",
    y = "Average Traffic Volume"
  ) +
  theme_minimal() +
  scale_x_continuous(breaks = seq(0, 23, 1)) # one tick per hour, 0-23
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Filter data for valid WKT geometries: keep rows whose WktGeom is present
# and looks like a WKT point, "POINT (<something>)"
traffic_data <- traffic_data %>%
  filter(!is.na(WktGeom), grepl("^POINT\\s\\(.+\\)$", WktGeom))
# Convert to spatial data
# The WKT coordinates (e.g. 997407.1, 208620.9) are projected x/y values,
# far outside the valid longitude/latitude range, so they must not be
# declared as EPSG:4326. Declaring them as 4326 mislabels every point and
# makes sf warn about an invalid bounding box for longlat data.
# The points are therefore declared in their source CRS and then
# re-projected, so traffic_sf still ends up in EPSG:4326 (lon/lat) with
# coordinates that are actually correct.
# NOTE(review): EPSG:2263 (NAD83 / New York Long Island, US survey feet) is
# the CRS normally used for NYC agency data and matches the magnitude of
# these coordinates - confirm against the dataset's metadata.
# No st_make_valid() step is needed: a point geometry cannot be invalid.
traffic_sf <- traffic_data %>%
  st_as_sf(wkt = "WktGeom", crs = 2263) %>%
  st_transform(crs = 4326)
## Warning in st_is_longlat(x): bounding box has potentially an invalid value
## range for longlat data
# Map every record as a point, coloured by its Vol value
# NOTE(review): each location appears once per record, so the points are
# drawn on top of each other and the visible colour is that of whichever
# record is drawn last; consider aggregating to one value per location
# before mapping.
ggplot() +
  geom_sf(data = traffic_sf, mapping = aes(color = Vol)) +
  labs(title = "Congestion Hotspots", color = "Volume") +
  theme_minimal()
# Write the cleaned table to CSV in the working directory (no row names)
write.csv(
  x = traffic_data,
  file = "cleaned_traffic_data.csv",
  row.names = FALSE
)
# Store the spatial object as an RDS file so it can be reloaded as-is
saveRDS(object = traffic_sf, file = "traffic_sf.rds")
This analysis provided insights into traffic congestion patterns across time and locations. The results highlight peak congestion hours, high-traffic boroughs, and specific hotspots such as the PULASKI BRIDGE.
Future work could involve integrating external factors such as weather or special events and leveraging predictive models to forecast congestion. These findings can help policymakers and urban planners optimize traffic management strategies.