I. Introduction

This project dives into Uber pickup patterns in New York City, examining trends across different time frames. We’ll explore the overall volume of rides and uncover how factors like day of the week and hour of the day influence pickup frequency. By analyzing these trends, we can gain valuable insights into Uber demand and usage patterns across the city.

II. Import Dataset

1. Load necessary libraries

library(ggplot2)
library(ggthemes)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(DT)
library(scales)

Create our own vector of colors for the plot

colors = c("#CC1011", "#665555", "#05a399", "#cfcaca", "#f5e840", "#0683c9", "#e075b0")
colors

## [1] "#CC1011" "#665555" "#05a399" "#cfcaca" "#f5e840" "#0683c9" "#e075b0"

Import dataset

# Import each month
apr_data <- read.csv("uber-raw-data-apr14.csv")
may_data <- read.csv("uber-raw-data-may14.csv")
jun_data <- read.csv("uber-raw-data-jun14.csv")
jul_data <- read.csv("uber-raw-data-jul14.csv")
aug_data <- read.csv("uber-raw-data-aug14.csv")
sep_data <- read.csv("uber-raw-data-sep14.csv")

# Combine all the data
data <- rbind(apr_data, may_data, jun_data, jul_data, aug_data, sep_data)
View(data)
cat("The dimensions of the data are:", ncol(data), "columns", "and", nrow(data), "rows")

## The dimensions of the data are: 4 columns and 4534327 rows

head(data)

##          Date.Time     Lat      Lon   Base
## 1 4/1/2014 0:11:00 40.7690 -73.9549 B02512
## 2 4/1/2014 0:17:00 40.7267 -74.0345 B02512
## 3 4/1/2014 0:21:00 40.7316 -73.9873 B02512
## 4 4/1/2014 0:28:00 40.7588 -73.9776 B02512
## 5 4/1/2014 0:33:00 40.7594 -73.9722 B02512
## 6 4/1/2014 0:33:00 40.7383 -74.0403 B02512

The data contains the following columns: * Date.Time: which is a factor * Lat: Latitude * Lon: Longitudes * Base: which is factor.

We will format the datetime into a more readable format using the Date Time conversion function.

data$Date.Time <- as.POSIXct(data$Date.Time, format = "%m/%d/%Y %H:%M:%S")
data$Time <- format(as.POSIXct(data$Date.Time, format = "%m/%d/%Y %H:%M:%S"), format="%H:%M:%S")
data$Date.Time <- ymd_hms(data$Date.Time)

## Warning: 2211 failed to parse.

# Create columns for day of the week, date, month, and year
data$dayofweek <- factor(wday(data$Date.Time, label=TRUE))
data$date <- factor(day(data$Date.Time))
data$month <- factor(month(data$Date.Time, label=TRUE))
data$year <- factor(year(data$Date.Time))

# Create columns for time
data$hour = factor(hour(hms(data$Time)))
data$minute = factor(minute(hms(data$Time)))
data$second = factor(second(hms(data$Time)))

# View data
head(data)

##             Date.Time     Lat      Lon   Base     Time dayofweek date month
## 1 2014-04-01 00:11:00 40.7690 -73.9549 B02512 00:11:00       Tue    1   Apr
## 2 2014-04-01 00:17:00 40.7267 -74.0345 B02512 00:17:00       Tue    1   Apr
## 3 2014-04-01 00:21:00 40.7316 -73.9873 B02512 00:21:00       Tue    1   Apr
## 4 2014-04-01 00:28:00 40.7588 -73.9776 B02512 00:28:00       Tue    1   Apr
## 5 2014-04-01 00:33:00 40.7594 -73.9722 B02512 00:33:00       Tue    1   Apr
## 6 2014-04-01 00:33:00 40.7383 -74.0403 B02512 00:33:00       Tue    1   Apr
##   year hour minute second
## 1 2014    0     11      0
## 2 2014    0     17      0
## 3 2014    0     21      0
## 4 2014    0     28      0
## 5 2014    0     33      0
## 6 2014    0     33      0

Check for missing values in each column

colSums(is.na(data))

## Date.Time       Lat       Lon      Base      Time dayofweek      date     month 
##      2211         0         0         0         0      2211      2211      2211 
##      year      hour    minute    second 
##      2211         0         0         0

Remove missing values

clean_data <- na.omit(data)
colSums(is.na(clean_data))

## Date.Time       Lat       Lon      Base      Time dayofweek      date     month 
##         0         0         0         0         0         0         0         0 
##      year      hour    minute    second 
##         0         0         0         0

III. Exploratory Data Analysis (EDA)

1. Data Overview

summary(clean_data)

##    Date.Time                           Lat             Lon        
##  Min.   :2014-04-01 00:01:00.00   Min.   :39.66   Min.   :-74.93  
##  1st Qu.:2014-05-28 15:16:00.00   1st Qu.:40.72   1st Qu.:-74.00  
##  Median :2014-07-17 14:43:00.00   Median :40.74   Median :-73.98  
##  Mean   :2014-07-11 18:49:41.19   Mean   :40.74   Mean   :-73.97  
##  3rd Qu.:2014-08-27 21:55:00.00   3rd Qu.:40.76   3rd Qu.:-73.97  
##  Max.   :2014-09-30 22:59:00.00   Max.   :42.12   Max.   :-72.07  
##                                                                   
##      Base               Time           dayofweek         date        
##  Length:4532116     Length:4532116     Sun:489578   30     : 167101  
##  Class :character   Class :character   Mon:541341   12     : 160532  
##  Mode  :character   Mode  :character   Tue:663651   16     : 158867  
##                                        Wed:696296   13     : 156767  
##                                        Thu:754940   23     : 155958  
##                                        Fri:740778   9      : 155077  
##                                        Sat:645532   (Other):3577814  
##  month           year              hour             minute        second     
##  Apr: 564264   2014:4532116   17     : 336190   10     :  77757   0:4532116  
##  May: 652124                  18     : 324679   14     :  77161              
##  Jun: 663545                  16     : 313400   15     :  77124              
##  Jul: 795732                  19     : 294513   13     :  76957              
##  Aug: 828805                  20     : 284604   12     :  76849              
##  Sep:1027646                  21     : 281460   8      :  76719              
##                               (Other):2697270   (Other):4069549

2. Data Visualization

Total trips by hours in a day

hourly_data <- clean_data %>% 
                group_by(hour) %>% 
                dplyr::summarize(Total = n())
datatable(hourly_data)

ggplot(hourly_data, aes(hour, Total)) + 
geom_bar(stat = "identity",fill = "lightblue") +
ggtitle("Uber - Trips by Hour") +
theme(legend.position = "none", plot.title = element_text(hjust = 0.5)) +
scale_y_continuous(labels = comma)

Total trips by hour and month

month_hour_data <- clean_data %>% 
  group_by(month, hour) %>% 
  dplyr::summarize(Total = n())

## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

ggplot(month_hour_data, aes(hour, Total, fill = month)) +
        geom_bar(stat = "identity") +
        ggtitle("Uber - Trips by Hour and Month") +
        scale_y_continuous(labels = comma)

Total trips by date of the month

date_data <- clean_data %>% 
  group_by(date) %>% 
  dplyr::summarize(Trips = n())

ggplot(date_data, aes(date, Trips)) +
geom_bar(stat = "identity", fill = "coral") +
ggtitle("Uber - Trips by date of the month") +
theme(legend.position = "none")

scale_y_continuous(labels = comma)

## <ScaleContinuousPosition>
##  Range:  
##  Limits:    0 --    1

Total trips by month

month_data <- clean_data %>% 
  group_by(month) %>% 
  dplyr::summarize(Total = n())

ggplot(month_data, aes(month, Total)) +
geom_bar(stat = "identity", fill = "lightcoral") +
ggtitle("Uber - Trips by month") +
theme(legend.position = "none")

scale_y_continuous(labels = comma)

## <ScaleContinuousPosition>
##  Range:  
##  Limits:    0 --    1

Total trips by Bases

ggplot(clean_data, aes(Base)) + 
geom_bar(fill = "darkred") +
scale_y_continuous(labels = comma) +
ggtitle("Total Trips by Bases")

Total trips by Bases and Month

ggplot(clean_data, aes(Base, fill = month)) + 
geom_bar(position = "dodge") +
scale_y_continuous(labels = comma) +
ggtitle("Total Trips by Bases and Month") +
scale_fill_manual(values = colors)

Heatmap Visualization

# Heatmap by Date and Hour
date_hour_data <- clean_data %>% 
  group_by(date, hour) %>% 
  summarize(Total = n())

## `summarise()` has grouped output by 'date'. You can override using the
## `.groups` argument.

ggplot(date_hour_data, aes(date, hour, fill = Total)) +
geom_tile(color = "gray") +
ggtitle("Heat Map by Date and Hour")

# Heatmap by Month and Date
month_date_data <- clean_data %>% 
  group_by(month, date) %>% 
  summarize(Total = n())

## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

ggplot(month_date_data, aes(date, month, fill = Total)) +
geom_tile(color = "gray") +
ggtitle("Heat Map by Month and Date")

# Heatmap by Month and Day of the Week
month_day_data <- clean_data %>% 
  group_by(month, dayofweek) %>% 
  summarize(Total = n())

## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

ggplot(month_day_data, aes(dayofweek, month, fill = Total)) +
geom_tile(color = "gray") +
ggtitle("Heat Map by Month and Day of the Week")

# Map Visualization
# Set Map Constants
min_lat <- 40.5774 
max_lat <- 40.9176
min_long <- -74.15
max_long <- -73.7004


ggplot(data, aes(x = Lon, y = Lat)) +
geom_point(size = 1, color = "blue") +
scale_x_continuous(limits = c(min_long, max_long)) +
scale_y_continuous(limits = c(min_lat, max_lat)) +
theme_map() +
ggtitle("NYC Map Based on Uber Rides from Apr-24 to Sep-24")

## Warning: Removed 71701 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Map Visualization
# Set Map Constants
min_lat <- 40.5774 
max_lat <- 40.9176
min_long <- -74.15
max_long <- -73.7004


ggplot(data, aes(x = Lon, y = Lat, color = Base)) +
geom_point(size = 1) +
scale_x_continuous(limits = c(min_long, max_long)) +
scale_y_continuous(limits = c(min_lat, max_lat)) +
theme_map() +
ggtitle("NYC Map Based on Uber Rides from Apr-24 to Sep-24 by BASE")

## Warning: Removed 71701 rows containing missing values or values outside the scale range
## (`geom_point()`).

Uber Data Analysis

Hoang Nguyen