This project dives into Uber pickup patterns in New York City, examining trends across different time frames. We’ll explore the overall volume of rides and uncover how factors like day of the week and hour of the day influence pickup frequency. By analyzing these trends, we can gain valuable insights into Uber demand and usage patterns across the city.
library(ggplot2)
library(ggthemes)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(DT)
library(scales)
# Seven-colour palette; used further down as the manual fill scale
colors <- c(
  "#CC1011", "#665555", "#05a399", "#cfcaca",
  "#f5e840", "#0683c9", "#e075b0"
)
print(colors)
## [1] "#CC1011" "#665555" "#05a399" "#cfcaca" "#f5e840" "#0683c9" "#e075b0"
# Import each month of raw pickup records (one CSV per month, April to
# September 2014, read from the working directory)
apr_data <- read.csv("uber-raw-data-apr14.csv")
may_data <- read.csv("uber-raw-data-may14.csv")
jun_data <- read.csv("uber-raw-data-jun14.csv")
jul_data <- read.csv("uber-raw-data-jul14.csv")
aug_data <- read.csv("uber-raw-data-aug14.csv")
sep_data <- read.csv("uber-raw-data-sep14.csv")

# Combine all the data into a single data frame, stacking the months in
# chronological order
data <- rbind(apr_data, may_data, jun_data, jul_data, aug_data, sep_data)

# View() opens an interactive data viewer, so only call it when a user is
# actually at the console; it has no place in a non-interactive run or render
if (interactive()) {
  View(data)
}

# Report the size of the combined data set
cat("The dimensions of the data are:", ncol(data), "columns", "and", nrow(data), "rows")
## The dimensions of the data are: 4 columns and 4534327 rows
# Preview the first six rows of the combined raw data
head(data, n = 6L)
## Date.Time Lat Lon Base
## 1 4/1/2014 0:11:00 40.7690 -73.9549 B02512
## 2 4/1/2014 0:17:00 40.7267 -74.0345 B02512
## 3 4/1/2014 0:21:00 40.7316 -73.9873 B02512
## 4 4/1/2014 0:28:00 40.7588 -73.9776 B02512
## 5 4/1/2014 0:33:00 40.7594 -73.9722 B02512
## 6 4/1/2014 0:33:00 40.7383 -74.0403 B02512
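The data contains the following columns: * Date.Time: the pickup date and time, read in as a character string * Lat: the latitude of the pickup * Lon: the longitude of the pickup * Base: the base code attached to the pickup, read in as a character string.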
We will convert the Date.Time column from text into a proper date-time object, and keep the time of day in a separate Time column, so that the date and time components can be extracted easily.
# Parse the raw "month/day/year hour:minute:second" strings exactly once.
# The time zone is pinned to UTC so that the wall-clock values are kept exactly
# as written in the files and do not depend on the machine's local time zone.
#
# The parsed column must NOT be passed through ymd_hms() afterwards: that call
# converts the date-times back to text before parsing, and it failed on 2211
# rows ("2211 failed to parse"), most likely the pickups at exactly midnight,
# whose text form can lose its time part. Those rows became NA and were then
# silently dropped from every later analysis.
data$Date.Time <- as.POSIXct(
  data$Date.Time,
  format = "%m/%d/%Y %H:%M:%S",
  tz = "UTC"
)

# Time of day as an "HH:MM:SS" string, taken from the parsed date-time
data$Time <- format(data$Date.Time, format = "%H:%M:%S")
# Calendar components of each pickup, each stored as a factor:
# labelled weekday, day of the month, labelled month, and year
data[["dayofweek"]] <- factor(wday(data$Date.Time, label = TRUE))
data[["date"]] <- factor(day(data$Date.Time))
data[["month"]] <- factor(month(data$Date.Time, label = TRUE))
data[["year"]] <- factor(year(data$Date.Time))

# Clock components, taken from the "HH:MM:SS" Time column.
# The strings are parsed once and the result is reused for all three columns.
pickup_clock <- hms(data$Time)
data[["hour"]] <- factor(hour(pickup_clock))
data[["minute"]] <- factor(minute(pickup_clock))
data[["second"]] <- factor(second(pickup_clock))

# Preview the first rows with the new columns
head(data)
## Date.Time Lat Lon Base Time dayofweek date month
## 1 2014-04-01 00:11:00 40.7690 -73.9549 B02512 00:11:00 Tue 1 Apr
## 2 2014-04-01 00:17:00 40.7267 -74.0345 B02512 00:17:00 Tue 1 Apr
## 3 2014-04-01 00:21:00 40.7316 -73.9873 B02512 00:21:00 Tue 1 Apr
## 4 2014-04-01 00:28:00 40.7588 -73.9776 B02512 00:28:00 Tue 1 Apr
## 5 2014-04-01 00:33:00 40.7594 -73.9722 B02512 00:33:00 Tue 1 Apr
## 6 2014-04-01 00:33:00 40.7383 -74.0403 B02512 00:33:00 Tue 1 Apr
## year hour minute second
## 1 2014 0 11 0
## 2 2014 0 17 0
## 3 2014 0 21 0
## 4 2014 0 28 0
## 5 2014 0 33 0
## 6 2014 0 33 0
Check for missing values in each column
# Number of missing values in each column
vapply(data, function(column) sum(is.na(column)), numeric(1))
## Date.Time Lat Lon Base Time dayofweek date month
## 2211 0 0 0 0 2211 2211 2211
## year hour minute second
## 2211 0 0 0
Remove missing values
# Keep only the rows that have no missing value in any column
clean_data <- stats::na.omit(data)

# Confirm that no missing values remain in any column
colSums(is.na(clean_data))
## Date.Time Lat Lon Base Time dayofweek date month
## 0 0 0 0 0 0 0 0
## year hour minute second
## 0 0 0 0

# Per-column summary of the cleaned data
summary(clean_data)
## Date.Time Lat Lon
## Min. :2014-04-01 00:01:00.00 Min. :39.66 Min. :-74.93
## 1st Qu.:2014-05-28 15:16:00.00 1st Qu.:40.72 1st Qu.:-74.00
## Median :2014-07-17 14:43:00.00 Median :40.74 Median :-73.98
## Mean :2014-07-11 18:49:41.19 Mean :40.74 Mean :-73.97
## 3rd Qu.:2014-08-27 21:55:00.00 3rd Qu.:40.76 3rd Qu.:-73.97
## Max. :2014-09-30 22:59:00.00 Max. :42.12 Max. :-72.07
##
## Base Time dayofweek date
## Length:4532116 Length:4532116 Sun:489578 30 : 167101
## Class :character Class :character Mon:541341 12 : 160532
## Mode :character Mode :character Tue:663651 16 : 158867
## Wed:696296 13 : 156767
## Thu:754940 23 : 155958
## Fri:740778 9 : 155077
## Sat:645532 (Other):3577814
## month year hour minute second
## Apr: 564264 2014:4532116 17 : 336190 10 : 77757 0:4532116
## May: 652124 18 : 324679 14 : 77161
## Jun: 663545 16 : 313400 15 : 77124
## Jul: 795732 19 : 294513 13 : 76957
## Aug: 828805 20 : 284604 12 : 76849
## Sep:1027646 21 : 281460 8 : 76719
## (Other):2697270 (Other):4069549
# Number of trips for each hour of the day
hourly_data <- dplyr::summarize(group_by(clean_data, hour), Total = n())

# Interactive table of the hourly totals
datatable(hourly_data)

# Bar chart of the hourly totals, with a centred title and
# comma-formatted counts on the y axis
ggplot(hourly_data, aes(x = hour, y = Total)) +
  geom_col(fill = "lightblue") +
  scale_y_continuous(labels = comma) +
  labs(title = "Uber - Trips by Hour") +
  theme(legend.position = "none", plot.title = element_text(hjust = 0.5))
# Number of trips for each month / hour-of-day combination
month_hour_data <- dplyr::summarize(
  group_by(clean_data, month, hour),
  Total = n()
)
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

# Stacked bar chart: one bar per hour, split by month
ggplot(month_hour_data, aes(x = hour, y = Total, fill = month)) +
  geom_col() +
  scale_y_continuous(labels = comma) +
  labs(title = "Uber - Trips by Hour and Month")
# Number of trips for each day of the month, pooled over all months
date_data <- clean_data %>%
  group_by(date) %>%
  dplyr::summarize(Trips = n())

# Bar chart of trips per day of the month.
# Every component has to be chained with `+`: without it after theme(), the
# y-axis scale was evaluated as a separate statement, printed on its own, and
# never applied to the plot (so the counts were not comma-formatted).
ggplot(date_data, aes(date, Trips)) +
  geom_bar(stat = "identity", fill = "coral") +
  ggtitle("Uber - Trips by date of the month") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = comma)
# Number of trips for each month
month_data <- clean_data %>%
  group_by(month) %>%
  dplyr::summarize(Total = n())

# Bar chart of trips per month.
# Every component has to be chained with `+`: without it after theme(), the
# y-axis scale was evaluated as a separate statement, printed on its own, and
# never applied to the plot (so the counts were not comma-formatted).
ggplot(month_data, aes(month, Total)) +
  geom_bar(stat = "identity", fill = "lightcoral") +
  ggtitle("Uber - Trips by month") +
  theme(legend.position = "none") +
  scale_y_continuous(labels = comma)
# Trips per base: geom_bar() counts the rows of clean_data for each Base value
ggplot(clean_data, aes(x = Base)) +
  geom_bar(fill = "darkred") +
  labs(title = "Total Trips by Bases") +
  scale_y_continuous(labels = comma)

# Trips per base, one side-by-side bar per month, filled from the custom palette
ggplot(clean_data, aes(x = Base, fill = month)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = colors) +
  scale_y_continuous(labels = comma) +
  labs(title = "Total Trips by Bases and Month")
# Heatmap by Date and Hour
# Trip count for every day-of-month / hour-of-day combination
date_hour_data <- dplyr::summarize(
  group_by(clean_data, date, hour),
  Total = n()
)
## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.

# One tile per combination, shaded by the trip count
ggplot(date_hour_data, aes(x = date, y = hour, fill = Total)) +
  geom_tile(color = "gray") +
  labs(title = "Heat Map by Date and Hour")
# Heatmap by Month and Date
# Trip count for every month / day-of-month combination
month_date_data <- dplyr::summarize(
  group_by(clean_data, month, date),
  Total = n()
)
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

# One tile per combination, shaded by the trip count
ggplot(month_date_data, aes(x = date, y = month, fill = Total)) +
  geom_tile(color = "gray") +
  labs(title = "Heat Map by Month and Date")
# Heatmap by Month and Day of the Week
# Trip count for every month / weekday combination
month_day_data <- dplyr::summarize(
  group_by(clean_data, month, dayofweek),
  Total = n()
)
## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

# One tile per combination, shaded by the trip count
ggplot(month_day_data, aes(x = dayofweek, y = month, fill = Total)) +
  geom_tile(color = "gray") +
  labs(title = "Heat Map by Month and Day of the Week")
# Map Visualization
# Set Map Constants: the latitude / longitude bounding box shown on the map
min_lat <- 40.5774
max_lat <- 40.9176
min_long <- -74.15
max_long <- -73.7004

# Scatter plot of every pickup location. The axis limits restrict the plot to
# the bounding box; points outside it are dropped (ggplot2 reports how many in
# a warning).
# The title states the period the data actually covers (April to September
# 2014); it previously read "Apr-24 to Sep-24".
ggplot(data, aes(x = Lon, y = Lat)) +
  geom_point(size = 1, color = "blue") +
  scale_x_continuous(limits = c(min_long, max_long)) +
  scale_y_continuous(limits = c(min_lat, max_lat)) +
  theme_map() +
  ggtitle("NYC Map Based on Uber Rides from Apr 2014 to Sep 2014")
## Warning: Removed 71701 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Map Visualization
# Set Map Constants: the latitude / longitude bounding box shown on the map
# (same values as for the previous map; repeated so this section runs on its
# own)
min_lat <- 40.5774
max_lat <- 40.9176
min_long <- -74.15
max_long <- -73.7004

# Scatter plot of every pickup location, coloured by Base. The axis limits
# restrict the plot to the bounding box; points outside it are dropped
# (ggplot2 reports how many in a warning).
# The title states the period the data actually covers (April to September
# 2014); it previously read "Apr-24 to Sep-24".
ggplot(data, aes(x = Lon, y = Lat, color = Base)) +
  geom_point(size = 1) +
  scale_x_continuous(limits = c(min_long, max_long)) +
  scale_y_continuous(limits = c(min_lat, max_lat)) +
  theme_map() +
  ggtitle("NYC Map Based on Uber Rides from Apr 2014 to Sep 2014 by BASE")
## Warning: Removed 71701 rows containing missing values or values outside the scale range
## (`geom_point()`).