MD Asadul Islam & Md Simon Chowdhury
2024-12-18
RequestID, Boro, Vol,
Direction, WktGeom.library(tidyverse) # Data manipulation
library(lubridate) # Date handling
library(ggplot2) # Visualization
library(sf) # Geospatial data
library(DT) # Interactive tables## RequestID Boro Yr M D HH MM Vol SegmentID
## 1 32970 Queens 2021 4 30 2 0 0 149701
## 2 32970 Queens 2021 4 30 2 15 1 149701
## 3 32970 Queens 2021 4 30 2 30 0 149701
## 4 32970 Queens 2021 4 30 2 45 0 149701
## 5 32970 Queens 2021 4 30 3 0 1 149701
## 6 32970 Queens 2021 4 30 3 15 0 149701
## WktGeom street
## 1 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 2 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 3 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 4 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 5 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## 6 POINT (997407.0998491726 208620.92612708386) PULASKI BRIDGE
## fromSt toSt Direction
## 1 Newtown Creek Shoreline Dead end NB
## 2 Newtown Creek Shoreline Dead end NB
## 3 Newtown Creek Shoreline Dead end NB
## 4 Newtown Creek Shoreline Dead end NB
## 5 Newtown Creek Shoreline Dead end NB
## 6 Newtown Creek Shoreline Dead end NB
## 'data.frame': 1712605 obs. of 14 variables:
## $ RequestID: int 32970 32970 32970 32970 32970 32970 32970 32970 32970 32970 ...
## $ Boro : chr "Queens" "Queens" "Queens" "Queens" ...
## $ Yr : int 2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
## $ M : int 4 4 4 4 4 4 4 4 4 4 ...
## $ D : int 30 30 30 30 30 30 30 30 30 30 ...
## $ HH : int 2 2 2 2 3 3 3 3 4 4 ...
## $ MM : int 0 15 30 45 0 15 30 45 0 15 ...
## $ Vol : int 0 1 0 0 1 0 0 1 0 0 ...
## $ SegmentID: int 149701 149701 149701 149701 149701 149701 149701 149701 149701 149701 ...
## $ WktGeom : chr "POINT (997407.0998491726 208620.92612708386)" "POINT (997407.0998491726 208620.92612708386)" "POINT (997407.0998491726 208620.92612708386)" "POINT (997407.0998491726 208620.92612708386)" ...
## $ street : chr "PULASKI BRIDGE" "PULASKI BRIDGE" "PULASKI BRIDGE" "PULASKI BRIDGE" ...
## $ fromSt : chr "Newtown Creek Shoreline" "Newtown Creek Shoreline" "Newtown Creek Shoreline" "Newtown Creek Shoreline" ...
## $ toSt : chr "Dead end" "Dead end" "Dead end" "Dead end" ...
## $ Direction: chr "NB" "NB" "NB" "NB" ...
Initial Observations: - Rows: 1712605 - Columns: 14
traffic_data <- traffic_data %>%
mutate(
Date = make_date(Yr, M, D),
Time = sprintf("%02d:%02d:%02d", HH, MM, 0),
DateTime = as.POSIXct(paste(Date, Time), format="%Y-%m-%d %H:%M:%S")
) %>%
filter(Vol >= 0) %>%
distinct()
# Check for missing values in each column
missing_values <- colSums(is.na(traffic_data))
traffic_data$Vol[traffic_data$Vol < 0] <- NA
traffic_data$Vol[is.na(traffic_data$Vol)] <- median(traffic_data$Vol, na.rm = TRUE)NA.time_volume <- traffic_data %>%
group_by(DateTime) %>%
summarise(Total_Vol = sum(Vol))
ggplot(time_volume, aes(x = DateTime, y = Total_Vol)) +
geom_line(color = "blue", size = 1) +
labs(
title = "Traffic Volume Over Time",
x = "Time",
y = "Volume"
) +
theme_minimal()## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
borough_volume <- traffic_data %>%
group_by(Boro) %>%
summarise(Total_Vol = sum(Vol))
ggplot(borough_volume, aes(x = Boro, y = Total_Vol, fill = Boro)) +
geom_bar(stat = "identity") +
geom_text(aes(label = Total_Vol), vjust = -0.5) +
labs(
title = "Traffic Volume by Borough",
x = "Borough",
y = "Volume"
) +
theme_minimal()station_data <- traffic_data %>%
filter(street == "PULASKI BRIDGE") %>%
mutate(Hour = HH) %>%
group_by(Hour) %>%
summarise(Average_Volume = mean(Vol, na.rm = TRUE))
ggplot(station_data, aes(x = Hour, y = Average_Volume)) +
geom_line(color = "blue", size = 1) +
geom_point(color = "red", size = 2) +
labs(
title = "Hourly Traffic Volume at Pulaski Bridge",
x = "Hour of the Day",
y = "Average Traffic Volume"
) +
theme_minimal()# Filter data for valid WKT geometries
traffic_data <- traffic_data %>% filter(!is.na(WktGeom))
traffic_data <- traffic_data %>% filter(grepl("^POINT\\s\\(.+\\)$", WktGeom))
# Convert to spatial data
traffic_sf <- st_as_sf(traffic_data, wkt = "WktGeom", crs = 4326)
# Fix invalid geometries
traffic_sf <- traffic_sf %>% st_make_valid()## Warning in st_is_longlat(x): bounding box has potentially an invalid value
## range for longlat data
# Plot congestion hotspots
ggplot(data = traffic_sf) +
geom_sf(aes(color = Vol)) +
labs(
title = "Congestion Hotspots",
color = "Volume"
) +
theme_minimal()