Data Analysis on Uber Rides in New York City

Import the necessary libraries

library(ggplot2)
library(ggthemes)
library(lubridate)

## Loading required package: timechange

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tidyr)
library(tidyverse)

## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──

## ✔ tibble  3.1.8     ✔ stringr 1.5.0
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## ✔ purrr   1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ lubridate::date()        masks base::date()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ lubridate::intersect()   masks base::intersect()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ lubridate::setdiff()     masks base::setdiff()
## ✖ lubridate::union()       masks base::union()

library(DT)
library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

Creating a vector of colors for the plots

# Shared fill palette for the charts below: seven colours, one per weekday
# (the month charts use only the first six).
colors <- c(
  "#030303", "#BF3EFF", "#05a399", "#cfcaca",
  "#B22222", "#0683c9", "#f83599"
)

Read the data for each month (April to September 2014)

library(readr)
# All six monthly extracts sit in one folder and share a file-name pattern, so
# the folder is named once and each path is built from the month tag.
data_dir <- "~/Downloads/Uber-dataset"

# Read one monthly extract, e.g. tag "apr" -> ".../uber-raw-data-apr14.csv".
# Base read.csv() is kept on purpose: the rest of the analysis refers to the
# `Date.Time` column name that it produces.
read_month <- function(tag) {
  read.csv(file.path(data_dir, paste0("uber-raw-data-", tag, "14.csv")))
}

apr <- read_month("apr")
may <- read_month("may")
jun <- read_month("jun")
jul <- read_month("jul")
aug <- read_month("aug")
sept <- read_month("sep")

Combine all the files into a single dataframe or dataset

# Stack the six monthly tables (April first, September last) row-wise.
uber_data <- do.call(rbind, list(apr, may, jun, jul, aug, sept))

Return the first 10 rows of the dataset, uber_data

# Preview the first ten rows of the combined data.
head(uber_data, n = 10)

##           Date.Time     Lat      Lon   Base
## 1  4/1/2014 0:11:00 40.7690 -73.9549 B02512
## 2  4/1/2014 0:17:00 40.7267 -74.0345 B02512
## 3  4/1/2014 0:21:00 40.7316 -73.9873 B02512
## 4  4/1/2014 0:28:00 40.7588 -73.9776 B02512
## 5  4/1/2014 0:33:00 40.7594 -73.9722 B02512
## 6  4/1/2014 0:33:00 40.7383 -74.0403 B02512
## 7  4/1/2014 0:39:00 40.7223 -73.9887 B02512
## 8  4/1/2014 0:45:00 40.7620 -73.9790 B02512
## 9  4/1/2014 0:55:00 40.7524 -73.9960 B02512
## 10 4/1/2014 1:01:00 40.7575 -73.9846 B02512

What are the dimensions of the dataset?

# Row count followed by column count.
c(nrow(uber_data), ncol(uber_data))

## [1] 4534327       4

What is the class of the date/time?

# Inspect how the timestamp column was read in.
class(uber_data[["Date.Time"]])

## [1] "character"

Format/Convert the Date.Time into a more readable format.

library(lubridate)
# Parse the raw "month/day/year hour:minute:second" strings exactly once.
# mdy_hms() returns a UTC POSIXct that keeps the wall-clock values from the
# file, so the result does not depend on the machine's local time zone.
# NOTE(review): the previous version converted the parsed POSIXct back to text
# and re-parsed it with ymd_hms(); on R versions that print midnight
# timestamps without a time part that round-trip can produce NA - confirm on
# your R version.
uber_data$Date.Time <- mdy_hms(uber_data$Date.Time)
# Clock time as an "HH:MM:SS" string, taken from the already-parsed timestamp.
uber_data$Time <- format(uber_data$Date.Time, format = "%H:%M:%S")

Create individual columns for the year, month, day, and time variables.

# Add calendar components of the timestamp as factor columns: day of month,
# labelled month, year, and labelled day of the week.
uber_data <- uber_data %>%
  dplyr::mutate(
    day = factor(day(Date.Time)),
    month = factor(month(Date.Time, label = TRUE)),
    year = factor(year(Date.Time)),
    dayofweek = factor(wday(Date.Time, label = TRUE))
  )

# Time variables: hour, minute, and second, each stored as a factor.
# The "HH:MM:SS" strings are parsed into a period once and reused, instead of
# re-parsing the whole column for every component.
time_parts <- hms(uber_data$Time)

uber_data$hour <- factor(hour(time_parts))
uber_data$minute <- factor(minute(time_parts))
uber_data$second <- factor(second(time_parts))

Let’s see if we got what we wanted

# Preview the first ten rows, now including the derived date/time columns.
head(uber_data, n = 10)

##              Date.Time     Lat      Lon   Base     Time day month year
## 1  2014-04-01 00:11:00 40.7690 -73.9549 B02512 00:11:00   1   Apr 2014
## 2  2014-04-01 00:17:00 40.7267 -74.0345 B02512 00:17:00   1   Apr 2014
## 3  2014-04-01 00:21:00 40.7316 -73.9873 B02512 00:21:00   1   Apr 2014
## 4  2014-04-01 00:28:00 40.7588 -73.9776 B02512 00:28:00   1   Apr 2014
## 5  2014-04-01 00:33:00 40.7594 -73.9722 B02512 00:33:00   1   Apr 2014
## 6  2014-04-01 00:33:00 40.7383 -74.0403 B02512 00:33:00   1   Apr 2014
## 7  2014-04-01 00:39:00 40.7223 -73.9887 B02512 00:39:00   1   Apr 2014
## 8  2014-04-01 00:45:00 40.7620 -73.9790 B02512 00:45:00   1   Apr 2014
## 9  2014-04-01 00:55:00 40.7524 -73.9960 B02512 00:55:00   1   Apr 2014
## 10 2014-04-01 01:01:00 40.7575 -73.9846 B02512 01:01:00   1   Apr 2014
##    dayofweek hour minute second
## 1        Tue    0     11      0
## 2        Tue    0     17      0
## 3        Tue    0     21      0
## 4        Tue    0     28      0
## 5        Tue    0     33      0
## 6        Tue    0     33      0
## 7        Tue    0     39      0
## 8        Tue    0     45      0
## 9        Tue    0     55      0
## 10       Tue    1      1      0

Data Visualization

Plot the number of trips for each hour of the day

# Working copy used by the plots in the rest of the analysis.
dataUber <- uber_data

# Total number of trips for each hour of the day.
hourData <- dataUber %>%
  group_by(hour) %>%
  dplyr::summarize(Total = n())

# theme_light() is a complete theme and replaces any theme() settings added
# before it, so it is applied first and the legend setting comes after it.
ggplot(hourData, aes(hour, Total)) +
  geom_col(fill = "purple", color = "black") +
  ggtitle("Trips Per Hour") +
  scale_y_continuous(labels = comma) +
  xlab("Hour") +
  ylab("Total Trips") +
  theme_light() +
  theme(legend.position = "none")

Plot the number of trips for each hour of the day, broken down by month

# Trip counts for every month/hour combination.
monthHour <- dplyr::summarize(
  group_by(dataUber, month, hour),
  Total = n()
)

## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

# Stacked bars: one bar per hour, split by month.
# The earlier theme(legend.position = "none") call was dropped: theme_light()
# came after it and overrode it, so it never had any effect, and the month
# legend is needed to read the fill colours. The y-axis label now matches the
# other trip-count charts.
ggplot(monthHour, aes(hour, Total, fill = month)) +
  geom_col() +
  ggtitle("Trips Per Hour and Month") +
  scale_y_continuous(labels = comma) +
  xlab("Hour") +
  ylab("Total Trips") +
  theme_light()

Plot the number of trips per day of the month

# Count trips for each day of the month (1-31), pooled across all months.
dayGroup <- dplyr::summarize(group_by(dataUber, day), Total = n())
head(dayGroup, n = 10)

## # A tibble: 10 × 2
##    day    Total
##    <fct>  <int>
##  1 1     127430
##  2 2     143201
##  3 3     142983
##  4 4     140923
##  5 5     147054
##  6 6     139886
##  7 7     143503
##  8 8     145984
##  9 9     155135
## 10 10    152500

# Bar chart of total trips for each day of the month.
# theme_light() is a complete theme and overrides theme() calls made before
# it, so it is applied first and the legend setting follows.
ggplot(dayGroup, aes(day, Total)) +
  geom_col(fill = "mediumvioletred", color = "white") +
  ggtitle("Trips Every Day") +
  scale_y_continuous(labels = comma) +
  xlab("Day") +
  ylab("Total Trips") +
  theme_light() +
  theme(legend.position = "none")

# Trip counts per month, day of the month, and day of the week.
day_month_data <- dplyr::summarize(
  group_by(dataUber, month, day, dayofweek),
  Total = n()
)

## `summarise()` has grouped output by 'month', 'day'. You can override using the
## `.groups` argument.

# day_month_data holds one row per calendar day, so several rows share each
# month/weekday combination. Plotted directly with dodged bars, those rows are
# drawn on top of each other and only the tallest single day is visible.
# Summing them first makes each bar show the month's total for that weekday.
day_month_data %>%
  group_by(month, dayofweek) %>%
  dplyr::summarize(Total = sum(Total), .groups = "drop") %>%
  ggplot(aes(month, Total, fill = dayofweek)) +
  geom_col(position = "dodge") +
  ggtitle("Trips By Day And Month") +
  scale_y_continuous(labels = comma) +
  scale_fill_manual(values = colors) +
  xlab("Month") +
  ylab("Total Trips") +
  theme_light() +
  labs(fill = "Day of the Week")

Plot the number of trips in each month

# Total trips per month; the table is printed right after it is built.
monthGroup <- dplyr::summarize(group_by(dataUber, month), Total = n())
monthGroup

## # A tibble: 6 × 2
##   month   Total
##   <ord>   <int>
## 1 Apr    564516
## 2 May    652435
## 3 Jun    663844
## 4 Jul    796121
## 5 Aug    829275
## 6 Sep   1028136

# One bar per month, coloured from the shared palette, with the legend hidden.
ggplot(monthGroup, aes(x = month, y = Total, fill = month)) +
  geom_col() +
  scale_fill_manual(values = colors) +
  scale_y_continuous(labels = comma) +
  labs(title = "Trips Per Month", x = "Months", y = "Number of Trips") +
  theme(legend.position = "none")

Plot the number of trips by day of the week for each month

# Trip counts for every month/weekday combination.
monthWeekday <- summarize(
  group_by(dataUber, month, dayofweek),
  Total = n()
)

## `summarise()` has grouped output by 'month'. You can override using the
## `.groups` argument.

# Side-by-side bars: months on the x-axis, one bar per day of the week.
ggplot(monthWeekday, aes(x = month, y = Total, fill = dayofweek)) +
  geom_col(position = "dodge") +
  scale_fill_manual(values = colors) +
  scale_y_continuous(labels = comma) +
  theme_light() +
  labs(
    title = "Trips By Day And Month",
    x = "Month",
    y = "Total Trips",
    fill = "Day of the Week"
  )

Plot the number of trips by each base

# Number of trip rows per base; geom_bar() does the counting.
ggplot(dataUber, aes(x = Base)) +
  geom_bar(fill = "aquamarine1") +
  scale_y_continuous(labels = comma) +
  labs(title = "Trips By Bases", y = "Trips") +
  theme_light()

Plot the number of trips by each base and month

# Trips per base, with one dodged bar per month.
ggplot(dataUber, aes(x = Base, fill = month)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = colors) +
  scale_y_continuous(labels = comma) +
  labs(title = "Trips By Bases And Month", y = "Trips") +
  theme_light()

Plot the number of trips by each base and day of the week

# Trips per base, with one dodged bar per day of the week.
ggplot(dataUber, aes(x = Base, fill = dayofweek)) +
  geom_bar(position = "dodge") +
  scale_fill_manual(values = colors) +
  scale_y_continuous(labels = comma) +
  theme_light() +
  labs(
    title = "Trips By Bases And Day of the Week",
    y = "Trips",
    fill = "Day of the Week"
  )

Create a heat map visualization of day, hour, and month

# Trip counts for every day-of-month/hour combination.
day_hour_data <- dplyr::summarize(
  group_by(uber_data, day, hour),
  Total = n()
)

## `summarise()` has grouped output by 'day'. You can override using the `.groups`
## argument.

# Heat map: day of the month on x, hour on y, tile colour = trip count.
ggplot(day_hour_data, aes(x = day, y = hour, fill = Total)) +
  geom_tile(color = "yellow") +
  scale_fill_gradient(low = "#ff5e62", high = "#ff9966", guide = "colorbar") +
  labs(title = "Heat Map by Hour and Day", x = "Day", y = "Hour")

# Heat map: day of the month on x, month on y, tile colour = trip count.
ggplot(day_month_data, aes(x = day, y = month, fill = Total)) +
  geom_tile(color = "yellow") +
  scale_fill_gradient(low = "#185a9d", high = "#43cea2", guide = "colorbar") +
  labs(title = "Heat Map by Month and Day", x = "Day", y = "Month")

# Heat map: day of the week on x, month on y, tile colour = trip count.
ggplot(monthWeekday, aes(x = dayofweek, y = month, fill = Total)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "#753a88", high = "#cc2b5e", guide = "colorbar") +
  labs(
    title = "Heat Map By Month And Day Of Week",
    x = "Day of the Week",
    y = "Month"
  )

# Data for the month-by-base heat map: trip counts per base and month.
monthBase <- summarize(
  group_by(dataUber, Base, month),
  Total = n()
)

## `summarise()` has grouped output by 'Base'. You can override using the
## `.groups` argument.

# Heat map: base on x, month on y, default fill gradient for the trip count.
ggplot(monthBase, aes(x = Base, y = month, fill = Total)) +
  geom_tile(color = "white") +
  theme_light() +
  labs(title = "Heat Map By Month And Bases", x = "Base", y = "Month")

# Data for the weekday-by-base heat map: trip counts per base and weekday.
dayofweekBase <- summarize(
  group_by(dataUber, Base, dayofweek),
  Total = n()
)

## `summarise()` has grouped output by 'Base'. You can override using the
## `.groups` argument.

# Heat map: base on x, day of the week on y, tile colour = trip count.
ggplot(dayofweekBase, aes(x = Base, y = dayofweek, fill = Total)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "#19547b", high = "#ffd89b", guide = "colorbar") +
  theme_light() +
  labs(
    title = "Heat Map By Day of the Week And Bases",
    x = "Base",
    y = "Day of the Week"
  )

Create a Map Visualization of Rides in New York

# Bounding box for the map; these values are used as the axis limits below.
minLat <- 40.5774
maxLat <- 40.9176
minLong <- -74.15
maxLong <- -73.7004

# Drop rows that contain an NA in any column, then plot each remaining pickup
# as a point. Points outside the axis limits are removed by ggplot with a
# warning.
uber <- na.omit(dataUber)
ggplot(uber, aes(x = Lon, y = Lat)) +
  geom_point(size = 1, color = "blue") +
  scale_x_continuous(limits = c(minLong, maxLong)) +
  scale_y_continuous(limits = c(minLat, maxLat)) +
  theme_map() +
  labs(title = "NYC Map Based On Uber Rides During 2014 (APR - SEPT)")

## Warning: Removed 71701 rows containing missing values (`geom_point()`).

# Same map as above, with each point coloured by its base; points outside the
# axis limits are removed by ggplot with a warning.
ggplot(dataUber, aes(x = Lon, y = Lat, color = Base)) +
  geom_point(size = 1) +
  scale_x_continuous(limits = c(minLong, maxLong)) +
  scale_y_continuous(limits = c(minLat, maxLat)) +
  theme_map() +
  labs(title = "NYC Map Based On Uber Rides During 2014 By Base (APR - SEPT)")

## Warning: Removed 71701 rows containing missing values (`geom_point()`).