########################################################################################
########################################################################################
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(stringr)
#install.packages("gridExtra")
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
#setwd("C:/Users/WL0681/OD/IIIT BA Course/C2_Statistics and Data Visualization/M4_Uber Supply_Demand Gap_Assignment")
uber_data = read.csv("Uber Request Data.csv")
str(uber_data)
## 'data.frame': 6745 obs. of 6 variables:
## $ Request.id : int 619 867 1807 2532 3112 3879 4270 5510 6248 267 ...
## $ Pickup.point : Factor w/ 2 levels "Airport","City": 1 1 2 1 2 1 1 1 2 2 ...
## $ Driver.id : int 1 1 1 1 1 1 1 1 1 2 ...
## $ Status : Factor w/ 3 levels "Cancelled","No Cars Available",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ Request.timestamp: Factor w/ 5618 levels "11/7/2016 0:00",..: 81 266 1563 1252 2023 2790 3162 4389 5108 670 ...
## $ Drop.timestamp : Factor w/ 2598 levels "11/7/2016 0:51",..: 79 201 923 754 1125 1439 1599 2097 2400 412 ...
#View(uber_data)
# Request ID and Driver.ID Needs to be unique for a request and driver respectively so we need to change int to "factor" data type
#uber_data$Request.id = factor(uber_data$Request.id)
#uber_data$Driver.id = factor(uber_data$Driver.id)
#######################################################################################
########..........DATA CLEANING & PREPARAITION..........#####################
# Standardizing date fomat separator form "/" to "-" and removing seconds as this granular level analysis is not required
uber_data$Request.timestamp = str_replace_all(uber_data$Request.timestamp,"\\/","-")
uber_data$Drop.timestamp = str_replace_all(uber_data$Drop.timestamp,"\\/","-")
# converting into standard R date-time format
uber_data$Request.timestamp = as.POSIXct(uber_data$Request.timestamp, format = "%d-%m-%Y %H:%M")
uber_data$Drop.timestamp = as.POSIXct(uber_data$Drop.timestamp, format = "%d-%m-%Y %H:%M")
# creating separatel columns from the date & time: day, month, year, hours and minutes
uber_data$request_day = format(uber_data$Request.timestamp, "%d")
uber_data$request_month = format(uber_data$Request.timestamp, "%m")
uber_data$request_year = format(uber_data$Request.timestamp, "%Y")
uber_data$request_hrs = format(uber_data$Request.timestamp, "%H")
uber_data$request_min = format(uber_data$Request.timestamp, "%M")
# creating drop hours and drop minutes from the drop timestamp
uber_data$drop_hrs = format(uber_data$Drop.timestamp, "%H")
uber_data$drop_min = format(uber_data$Drop.timestamp,"%M")
uber_data$journey_minits = uber_data$Drop.timestamp-uber_data$Request.timestamp
uber_data$is_peakhr = factor(ifelse(uber_data$request_hrs>="05" &
uber_data$request_hrs <="10",
"peakhr_morning",
ifelse(uber_data$request_hrs>="17" &
uber_data$request_hrs <="22",
"peakhr_evening","Not_peakhr")))
uber_data_backup =uber_data
#View(uber_data_backup)
#write.csv(uber_data_backup,"uber_data_tableau.csv")
#########################################################################################
###########.......UNIVARIATE ANALYSIS................##########################
# We need to find any repetitions in the dataset for each column. It seems same drivers have been doing multiple trips.
apply(uber_data,2,function(x) sum(duplicated(x)))
## Request.id Pickup.point Driver.id Status
## 0 6743 6444 6742
## Request.timestamp Drop.timestamp request_day request_month
## 2729 4462 6740 6744
## request_year request_hrs request_min drop_hrs
## 6744 6721 6685 6720
## drop_min journey_minits is_peakhr
## 6684 6681 6742
# There are pickup/airport from and to the city
# 3 levels - no cars available, cancelled, trip completed
# each request is unique and this data is of July 2016
apply(uber_data,2,function(x) length(unique(x)))
## Request.id Pickup.point Driver.id Status
## 6745 2 301 3
## Request.timestamp Drop.timestamp request_day request_month
## 4016 2283 5 1
## request_year request_hrs request_min drop_hrs
## 1 24 60 25
## drop_min journey_minits is_peakhr
## 61 64 3
nrow(uber_data)
## [1] 6745
############.......... Uber Bivariate & Segmented variate Analysis..........###########
# hint: Create plots to visualize the frequency of requests that get cancelled or
#show 'no cars available'; identify the most problematic types of requests
#(city to airport / airport to city etc.) and the time slots (early mornings, late evenings etc.)
# Trip cancelled or no cars availability is high compared to trips completed. Uber is loosing revenue
# and >60% of the time customer is unable to obtain service from Uber due to this
ggplot(data = uber_data, mapping = aes(x = Status)) + geom_histogram(stat = "count") +
labs(title ="Frequency of Status of Cab request", subtitle = "Frequency of Booking Status", x= "Booking reqst in a day (hrs)")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

#ggsave("status_plot.pdf")
########..... (Refer Plot 1 in PPT) Frequency based on TRIP Status.......######
# From Airport to city there are very less cancellations. But MAJOR PROBLEM is CARS UNAVAILABLE at Airport for pickup
# City to Airport: Lot of cancellations? Why? But other hand trips completed to aiport is high
ggplot(data = uber_data,
mapping = aes(x = uber_data$Status, fill = Pickup.point )) +
geom_histogram(position = "stack", stat = "count") +
theme(title = element_text(size=9, face="bold"))+
labs(title ="Frequency of Status of Cab request",
subtitle = "Summary: About 60% of the requests customer is unable to obtain uber service due to cancellation/cab unvailability",
x= "Booking reqst 'Status' in a day")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

# ANALYSIS: Plot 1-
# #Cancellations and cab unavailability are persistent in this case. About 60% of times
# Airport to city or city to Airport cab request were not catered due to either unavailability or
# cancellations of the cab booking.
# Also Cab unavailability is high for Airport pickup requests and Cancellations are high for
# City pickup requests
# Demand is high During 5 AM-10AM and 5PM-10PM during every day are peak hours
ggplot(data = uber_data, mapping = aes(x = request_hrs)) + geom_bar() +
labs(title ="Cab Demand/Request Pattern During the day", subtitle = "Demand is high from 5-10AM & 5-10PM",
x= "Booking reqst in a day (hrs)", y ="Cab demand pattern by hour")

#ggsave("demand cabs.pdf")
# CANCELLATIONS are high in the morning peak hours compared to Evening peak hours.
# During evening peak hours cab unavailability is high in the evening.
# Overall, Uber is loosing business
rhrs_status =
ggplot(data = uber_data, mapping = aes(x = request_hrs, fill = Status)) +
geom_bar() + theme(title = element_text(size=9, face="bold")) +
labs(title ="Cab Demand/Request Pattern During the day",
subtitle = "Peak hrs 5-10AM: High Cancellations, Peak hrs 5-10PM: High Cab unavailability ",
x= "Booking reqst in a day (hrs)")
rhrs_status

#ggsave("CabDemand_status.pdf")
rhrs_status_pickuppoint = rhrs_status + facet_wrap(~Pickup.point)
rhrs_pickupPoint =
ggplot(data = uber_data, mapping = aes(x = request_hrs, fill = Pickup.point)) +
geom_bar() + theme(title = element_text(size=9, face="bold")) +
labs(title ="Cab Demand/Request Pattern by Pickup Point",
subtitle = "5-10AM: High Demand at City Pickup (to Airport); 5-10PM: High Demand at Airport (to city) ",
x= "Booking reqst in a day (hrs)", caption ="Considering demand 5-10AM & 5-10PM are Problamatic") +
annotate("text", x= 11, y= 500, col = "blue",label = "During 5-10 AM peak hours requests are high from city and 5-10PM requests/demand is high from Airport")
rhrs_pickupPoint

#ggsave("demandPattern_Pickuppoint.pdf")
grid.arrange(rhrs_status,rhrs_pickupPoint, ncol =2)

# Analaysis:
# During morning peak hours 5AM- 10AM city to Airport request are more
# During evening peak hours 5PM- 9PM Airport to city request are more
# Driver trip cancelled rate is high in the morning peak hours
# where as, cabs unavailability is high in the evening peak hours
#..... How journey_minits (Duration of journey)impacting overall business.....
jm_hrs_pickup = ggplot(data = uber_data, mapping = aes(y = journey_minits, x = request_hrs, fill = Pickup.point)) +
geom_boxplot() + theme(title = element_text(size=9, face="bold"))+
labs(title ="Journey Duration pattern in a weekday ",
subtitle = "Summary: Journey duration is relatively high to Airport in the morning & high to city in evening",
x= "Booking reqstd during the day (Hrs)", y = "Journey Duration")
jm_hrs_pickup
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 3914 rows containing non-finite values (stat_boxplot).

mean(uber_data$journey_minits,na.rm = TRUE)
## Time difference of 52.41152 mins
#grid.arrange(rhrs_pickupPoint,rhrs_status,jm_hrs_pickup, ncol= 2,nrow =2)
#ggsave("demand gap.pdf")
# (Refer Plot 2 in PPT) Problematic Area for Uber
grid.arrange(rhrs_status_pickuppoint,jm_hrs_pickup, ncol= 1,nrow =2)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 3914 rows containing non-finite values (stat_boxplot).

#ggsave("plot.pdf")
# ANALYSIS: Plot 2 -
# From top plot??? Peak hours are 05-10AM in the morning from the city . Peak hours
# are 05 PM-10 PM in the evening from airport.
# From bottom plot??? At high level the time taken to Airport travel is high in the
# morning peak hours.( Red color box area in the plot)
# So to mitigate supply demand gap Uber needs to have primary focus on peak hours
# i.e. 05AM-10 AM and 05-PM-10PM time slots. Also should focus on cab cancellations in the morning (City) and Cabs unavailability in the evening peak hours slots.
##########........What is demand & supply in Uber case .?.........########
#1. Demand = number of cab requests done. i.e. requests done while
#Trip completed + Trip cancelled + Cabs unavailable
#2. Supply = Number of cab availability. Only trip completed is considered as supply.
# Trip cancelled / cab unavailability ideally did not cater customer so did not consider
# as part of supply
###########################################################################################
# Why driver trip cancellation rate is high in the morning peak hours
# Ans: The median time taken to airport during morning peak hours for each trip to airport is higher
# compared with rest of the day.
# also journey time to city to airport is higher.
# Why cabs unavailablity is high in the evening peak hours
# Question: Why cab supply is less at Airport in the evening where peak demand exists from 5PM-9PM
# For each driver who is at the aiport what is the waiting time vs. driver from the city to airport
# Find the time slots when the highest gap exists
# 5AM-9AM and 5PM-10PM are the has highest gap exists in demand vs. supply
#Question:
# Find the types of requests (city-airport or airport-city) for which the gap is the most severe
#in the identified time slots
# Answer:
# From above plot we can handle the cab cancellations (5AM-10AM) with some additional incentives to drivers
# so the driver takes up airport drop.
# Cab unavilability is during (5PM-10PM) in the aiport during this peak period is challenging and
# Uber needs to supply more cabs some how and increase cab availability. Please note that from Airport
# cab cancellations are very less and not a big challenge
############################################################################################
#####.... Finding Cab booking requsts (demand) Vs. Cabs availability (Supply) Gap during
############ ..... PEAK HOURS (Morning & Evening) ..... ###############
##########......... Cab Supply Demand Gap (Refer Plot 3 in PPT)......############
# Assumption /Note: Only PEAK hours data for the slots 05-10AM & 05-10 PM has been
#considered and extracted from given Uber data set for analysis
# Extracting peak hours data from uber data to peakhr_data dataframe
peakhr_data = uber_data %>% filter(request_hrs %in% c("05","06","07","08","09","10",
"17","18","19","20","21","22"))
peakhr_data$is_peakhr = factor(ifelse(peakhr_data$request_hrs>="05" &
peakhr_data$request_hrs<="10",
"peakhr_morning","peakhr_evening"))
#View(peakhr_data)
# Peakhr_data dervied data frame considered for cab demand calculations
# drivers_trips derived data frame considered for supply demand calculations
# for this data frame we need to subset peak hours data
# demand for cabs during peak hours
# demand is taken from peakhr_data data frame where only peak hours data available
aggregate(Request.id~Pickup.point,peakhr_data,length)
## Pickup.point Request.id
## 1 Airport 2484
## 2 City 2508
peakhr_cab_demand = aggregate(Request.id~Pickup.point+Status,peakhr_data,length)
names(peakhr_cab_demand) = c("Pickup.point","Status", "cab_demand")
peakhr_cab_demand
## Pickup.point Status cab_demand
## 1 Airport Cancelled 141
## 2 City Cancelled 942
## 3 Airport No Cars Available 1455
## 4 City No Cars Available 546
## 5 Airport Trip Completed 888
## 6 City Trip Completed 1020
# Cab supply during peak hours
# cab supply is taken from driver_trips (has trip completed data) data frame where
#only trip completed data available
# obtaining only peak hours data from driver_trip to find supply
driver_trips = uber_data %>% filter(!is.na(Driver.id) & Status =="Trip Completed") %>%
group_by(Driver.id)
trips_completed_peakhr =
driver_trips %>%
filter(request_hrs %in% c("05","06","07","08","09","10",
"17","18","19","20","21","22"))
#View(trips_completed_peakhr)
aggregate(Driver.id~Pickup.point,trips_completed_peakhr,length)
## Pickup.point Driver.id
## 1 Airport 888
## 2 City 1020
peakhr_cab_supply = aggregate(Driver.id~Pickup.point + Status,trips_completed_peakhr,length)
names(peakhr_cab_supply) = c("Pickup.point","Status","cab_supply")
peakhr_cab_supply
## Pickup.point Status cab_supply
## 1 Airport Trip Completed 888
## 2 City Trip Completed 1020
#table(peakhr_cab_demand,peakhr_cab_supply)
peakhr_supply_demand= merge(peakhr_cab_demand,peakhr_cab_supply,all.x = TRUE)
#peakhr_supply_demand$Status = NULL
peakhr_supply_demand$cab_supply[which(is.na(peakhr_supply_demand$cab_supply))] =0
peakhr_supply_demand
## Pickup.point Status cab_demand cab_supply
## 1 Airport Cancelled 141 0
## 2 Airport No Cars Available 1455 0
## 3 Airport Trip Completed 888 888
## 4 City Cancelled 942 0
## 5 City No Cars Available 546 0
## 6 City Trip Completed 1020 1020
# aggregate(Pickup.point~cab_demand,cab_supply,peakhr_supply_demand,sum)
# peakhr_supply_demand %>% group_by(Pickup.point,Status)
# Plotting cab demand and cab supply for finding out supply demand gap
peakhr_cab_demand_plot = ggplot(data = peakhr_supply_demand,
aes(x = Pickup.point,y = cab_demand, fill = Status )) +
geom_bar(stat = "identity", position = "stack") +
theme(title = element_text(size=9, face="bold"))+
labs(title = "Cab Demand from Pickup Points during Peak hours",
subtitle ="Summary: Cab demand during peak hours based on Pickup Point and Status",
caption ="source: peakhr_data data frame subsetted for peak hours" ,
x = "Pickup.point", y = "Cab Demand (no of cab reqsts)")
peakhr_cab_demand_plot

peakhr_cab_supply_plot = ggplot(data = peakhr_supply_demand,
aes(x = Pickup.point, y = cab_supply, fill = Status )) +
geom_bar(stat = "identity") +
theme(title = element_text(size=9, face="bold"))+
labs(title = "Cab Supply from Pickup Points during Peak hours",
subtitle ="Summary: Cab supply during peak hours based on Pickup Point and trips completed",
caption ="source: driver_trips data frame subsetted for peak hours" ,
x = "Pickup.point", y = "cab supply (no of cab reqsts)") +
coord_cartesian(ylim = c(0,2500))
peakhr_cab_supply_plot

grid.arrange(peakhr_cab_demand_plot, peakhr_cab_supply_plot, ncol =2)

# ANALYSIS: Plot 3 -
# Case 1:
#Morning Peak hours (05-10 AM) City Pickups: Cabs demand (requests) is high for
#city pickups (city to airport) and high cancellations (refer plot 2) Uber was able to
#cater to 41% of the total requests during morning peak hours
# Cab demand & Supply from city:
#Cab supply is only ~1,020 and cab demand is 2,508. Hence Supply demand gap exists in
#peak hour morning from city pickups but lower than evening peak hours
#Case 2:
#Evening Peak Hours (05-10 PM) Airport Pickups: Cabs demand (requests) is high for
#Airport pickups (Airport to city). Cab unavailability is high (refer plot 2) and demand
#supply gap from airport & Peakhr_evening is evident from above plot. Only 36% of total requests
#from Airport were catered during evening peak hours. Hence, Evening from Airport supply demand
#gap is high during peak hours
# cab demand & supply from Airport:
#Cab supply is only ~888 and cab demand is 2,484. Hence Supply demand gap exists in peak hour
#evening from Airport pickup
### Demand Supply gap for entire data set - plot 3.1 in PPT###########
########......Cab Supply Demand Gap- Entire Day (plot 3.1 in PPT).....#########
# demand for cabs during entire day ........
# demand is taken from uber_data data frame where entire cab requests data available
aggregate(Request.id~Pickup.point,uber_data,length)
## Pickup.point Request.id
## 1 Airport 3238
## 2 City 3507
overall_cab_demand = aggregate(Request.id~Pickup.point+Status,uber_data,length)
names(overall_cab_demand) = c("Pickup.point","Status", "overall_cab_demand")
overall_cab_demand
## Pickup.point Status overall_cab_demand
## 1 Airport Cancelled 198
## 2 City Cancelled 1066
## 3 Airport No Cars Available 1713
## 4 City No Cars Available 937
## 5 Airport Trip Completed 1327
## 6 City Trip Completed 1504
# Cab supply during peak hours
# cab supply is taken from uber_date (has trip completed data) data frame where
#only trip completed data available
# obtaining only trip completed data from uber_data to find supply
trips_completed_entireday =
uber_data %>% filter(Status == "Trip Completed")
#View(trips_completed_entireday)
aggregate(Driver.id~Pickup.point,trips_completed_entireday,length)
## Pickup.point Driver.id
## 1 Airport 1327
## 2 City 1504
overall_cab_supply = aggregate(Driver.id~Pickup.point + Status,trips_completed_entireday,length)
names(overall_cab_supply) = c("Pickup.point","Status","overall_cab_supply")
overall_cab_supply
## Pickup.point Status overall_cab_supply
## 1 Airport Trip Completed 1327
## 2 City Trip Completed 1504
overall_supply_demand= merge(overall_cab_demand,overall_cab_supply,all.x = TRUE)
overall_supply_demand
## Pickup.point Status overall_cab_demand overall_cab_supply
## 1 Airport Cancelled 198 NA
## 2 Airport No Cars Available 1713 NA
## 3 Airport Trip Completed 1327 1327
## 4 City Cancelled 1066 NA
## 5 City No Cars Available 937 NA
## 6 City Trip Completed 1504 1504
overall_supply_demand$overall_cab_supply[which(is.na(overall_supply_demand$overall_cab_supply))] =0
overall_supply_demand
## Pickup.point Status overall_cab_demand overall_cab_supply
## 1 Airport Cancelled 198 0
## 2 Airport No Cars Available 1713 0
## 3 Airport Trip Completed 1327 1327
## 4 City Cancelled 1066 0
## 5 City No Cars Available 937 0
## 6 City Trip Completed 1504 1504
# Plotting overall cab demand and supply gap
overall_cab_demand_plot = ggplot(data = overall_supply_demand,
aes(x = Pickup.point,y = overall_cab_demand, fill = Status )) +
geom_bar(stat = "identity", position = "stack") +
theme(title = element_text(size=9, face="bold"))+
labs(title = "Cab Demand of entire day from Pickup Points",
subtitle ="Summary: Entire day cab demand is based on Pickup Point and Status",
caption ="source: uber_data data frame subsetted for peak hours" ,
x = "Pickup.point", y = "overall Cab Demand (No. of cab reqsts for entire day)")
overall_cab_demand_plot

overall_cab_supply_plot = ggplot(data = overall_supply_demand,
aes(x = Pickup.point, y = overall_cab_supply, fill = Status )) +
geom_bar(stat = "identity") +
theme(title = element_text(size=9, face="bold"))+
labs(title = "Cab Supply for entire from Pickup Points",
subtitle ="Summary: Entire day Cab supply is based on Pickup Point and trips completed",
caption ="source: uber_data data frame subsetted for Trip completed" ,
x = "Pickup.point", y = "overall cab supply (No.of cab reqsts for entire day") +
coord_cartesian(ylim = c(0,3500))
overall_cab_supply_plot

grid.arrange(overall_cab_demand_plot, overall_cab_supply_plot, ncol =2)

# ANALYSIS: (plot 3.1 from PPT)
#Case 1 : City Pickups: Uber was able to cater to 42% of the customer requests during entire day.
#During peak hours (from plot 3) Uber was able to cater to 41% of the customer requests
# cab demand and supply from city:
# Cab supply is only ~1,504 and cab demand is 3,507. Hence Supply demand gap exists for
#city pick ups as well
#Case 2: Airport Pickups: Uber was able to cater to 40% of the customer requests during entire
#day for the data given period i.e. 11th to 15th July 2016. During peak hours only 36% of the
#request are catered.
# cab demand and supply from airport:
# Cab supply is only ~1,327 and cab demand is 3,238. Hence Supply demand gap exists from
#Airport pickups
# Conclusion: Demand and supply gap exists for city and airport pick ups during entire day of
#operation. However, during peak hours evening Airport pickups are challenging and
#supply demand gap is acute.
##############################################################################################
#########... TIME SPENT AT AIRPORT or in CIty WITHOUT A TRIP (IDLE TIME AT AIRPORT/City)...##########
#######....Refer plot 4 in PPT: Wating time patterns at Airport .......######
#KEY ASSUMPTION:
# only city to airport and airport to city data available. It was assumed that driver only
# does Airport/city pickups which is not true in real world. In between each Airport/city pickup
# there will be near by trips.However, as this data not available we are assuming drivers do not
# take any near by trips and only tool Airport to City or city to airport trips
driver_trips = data.frame(driver_trips)
trip_count = data.frame(table(driver_trips$Driver.id))[,2]
#View(driver_trips)
# declaring empty vectors for catching variables inside user defined function idle_time
waittime = vector()
d_id = vector()
temp = vector()
rownum = vector()
index =0
#Idle_time_airport = data.frame(idle_time(driver_trips))
# Derving idle time based on pickup and drop point
idle_time = function(dt, pickup_point,drop_point) {
for (i in 1:length(trip_count)) {
for(j in 1:trip_count[i]) {
k = sum(index,j)
if(dt$Pickup.point[k] == pickup_point & dt$Pickup.point[k+1] == drop_point) {
#print(k)
timediff = difftime(dt$Request.timestamp[k+1], dt$Drop.timestamp[k], units = "hours")
idletime_mins = str_extract_all(timediff,"\\d+", simplify = TRUE)[,1]
#print(idletime_mins)
waittime[length(waittime)+1] = idletime_mins
d_id[length(d_id)+1] = i
rownum[length(rownum)+1]= index+j
}
}
temp[i] = trip_count[i]
index = sum(temp)
}
dat_frame = data.frame(rownum, d_id,waittime)
#write.csv(dat_frame, file = "mywork.csv")
return(dat_frame)
}
# function call for Identifying idle time at the airport
idle_time_airport = idle_time(driver_trips,"City","Airport")
#idle_time_airport
#Assumption: We have > 14 hours of waiting time at the airport which is not recommended/ have very less probability
# so removing waiting hours which are >14hrs considering as outliers
idle_time_airport$waittime = as.numeric(as.character(idle_time_airport$waittime))
# ASSUMPTION: Max. Waiting time taken as 14 Hrs
idle_time_airport = idle_time_airport[which(idle_time_airport$waittime <= 14),]
#write.csv(idle_time_airport,"idletime_airport.csv")
# obtaining aggregation on Driver.ID to find out avg wait time
mean_waittime_airport = round(aggregate(waittime~d_id, idle_time_airport,mean),1)
median(idle_time_airport$waittime)
## [1] 5
# Refer Plot 4: Waiting time patterns at Airport
# plotting waiting time at airport distribtuion for each driver
airport_idletime_plot = ggplot(data = idle_time_airport, aes(x = waittime, y = d_id, group = waittime )) + geom_boxplot() +
geom_point(aes(x=mean(waittime)), col = "red") +
theme(title = element_text(size=9, face="bold")) +
labs(title = "Each Driver waiting time pattern at Airpot with Mean waiting time",
subtitle ="Summary: Each Driver's waiting time at Airport is very irregular ranging from 0HRs to 14HRs",
x = "Waiting time at Airport in Hrs",y ="Driver ID/Each Driver")
airport_idletime_plot

########################################################################################
#####---- Identify what time of the day waiting time is more------#######
##### Waiting Time Patterns at Airport (Refer Plot 4 in PPT)
# Assumptions / Note:
# Waiting time in the Airport in plot 1 arrived based on
# 1. The difference between drop time (drop time stamp) of a city pick up and
# pick up(request.timestamp) time of city pickup
# 2. After a city to Airport trip is completed driver waited at the airport for
# the next pick up to the city
# 3. Maximum wait time of <=14 hours at the airport has been assumed. The chances
# of getting a trip is less if he is unable to get a trip during peak hours.
# E.g. driver dropped at 6 AM if he is not getting a trip till 8 PM of the same day probability
# of getting an trip from Airport to city is marginal
rowindex = idle_time_airport$rownum
#Choosing the data/rows from driver_trips (contains all trip completed data) based on row numbers
#extracted from idle_time_airport data frame (contains idel time for each driver
#who did Airport pickup and concered row numbers)
df_waittime_airport = driver_trips[rowindex,]
df_waittime_airport$idle_time = idle_time_airport$waittime
df_waittime_airport$is_peakhr = factor(ifelse(df_waittime_airport$request_hrs>="05" &
df_waittime_airport$request_hrs <="10",
"peakhr_morning",
ifelse(df_waittime_airport$request_hrs>="17" &
df_waittime_airport$request_hrs <="22",
"peakhr_evening","Not_peakhr")))
#View(df_waittime_airport)
peakhr_wait_plot = ggplot(df_waittime_airport, aes(x = df_waittime_airport$request_hrs,
y = df_waittime_airport$idle_time, fill = is_peakhr)) +
geom_boxplot() + theme(title = element_text(size=9, face="bold")) +
labs(title = "Airport Waiting time pattern during the day",
subtitle ="Summary: Median wating time for new booking is relatively high during evening peak hours",
x = "Cab Booking Reqsted during day (hrs)",y ="Waiting time at Ariport for booking(Hrs)")
grid.arrange(airport_idletime_plot,peakhr_wait_plot, ncol = 2)

#ANALYSIS: (Refer Plot 4 in PPT): Waiting time patterns at Airport
# From the left plot ??? The waiting time of each of the driver is very irregular varying
# from 0hrs to 15hrs.
# From right plot ??? Median waiting time during peak evening hours is relatively high
# and time variation in box plot is high during pea hours. These reasons and uncertainty around wait time might be discouraging drivers to take up trips to and from airport
# Driver had to wait more than 5 hours in the airport for the booking also during peak hours
#########################################################################################
#####........ for completed trips, Journey time during peak hours.......#######
############ refer plot 5 in PPT: Journey Duration vs. Supply Gap ###################
# ASSUMPTION / NOTE:
#Journey Duration is the time difference in minutes between pick up and drop time
#of a specific trip/request id
driver_trips$is_peakhr = factor(ifelse(driver_trips$request_hrs>="05" &
driver_trips$request_hrs <="10",
"peakhr_morning",
ifelse(driver_trips$request_hrs>="17" &
driver_trips$request_hrs <="22",
"peakhr_evening","Not_peakhr")))
##### ... Please refer Plot 5 in ppt
ggplot(driver_trips, mapping = aes(x= request_hrs, y = journey_minits, fill = is_peakhr)) +
geom_boxplot() + facet_wrap(~Pickup.point) +
theme(title = element_text(size=9, face="bold")) +
labs(title = "Journey Duration Airport vs. City Pickups",
subtitle ="Summary: During peak hours Journey duration is same for both Airport and city pickups",
x = "Cab Booking hours in a day",y ="Journey Duraion in (minutes)")
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# ggplot(driver_trips, aes(x= is_peakhr, y= journey_minits)) + geom_boxplot()
# ggplot(driver_trips, aes(x= Pickup.point, y= journey_minits)) + geom_boxplot()
#ANALYSIS: (refer plot 5 in PPT:Journey Duration vs. Supply Gap)
#From the plot below the journey duration during peak morning is relatively high compared
#to peak evening hours. Hence, it is most likely that chances of driver cancelling/not
#interested to go to airport is high due to high journey duration
#########################################################################################
######...... Trip cancelled by Driver.....##################
########......Refer PLOT 6 in PPT: TRIP Cancellation ........###########
# Assumptions/Note:
#Trip Cancellation ??? When driver cancels a pickup request from the customer.
#This case all the data in the given dataset is for trip cancelled status except drop time of a trip which will not be available as trip got cancelled.
#From the given Uber data set, we extracted data/rows related TRIP CANCELLATIONS status
#only and analyzed patterns on this data. Hence, data considered for plots in this slide
#consists of TRIP cancellation data only
# Testing hypothesis, the cab cancellation trend during morning peak hours holds good by
#analyzing only peak hour data during all days.
# filtering Status "Trip Cancelled" rows from uber_data to further analysis
cancelled_trip = uber_data %>% filter(Status == "Cancelled") %>% group_by(Driver.id)
cancelled_trip$is_peakhr = factor(ifelse(cancelled_trip$request_hrs>="05" &
cancelled_trip$request_hrs <="10",
"peakhr_morning",
ifelse(cancelled_trip$request_hrs>="17" &
cancelled_trip$request_hrs <="22",
"peakhr_evening","Not_peakhr")))
#View(cancelled_trip)
supplygap_notrips_plot =
ggplot(data = cancelled_trip, mapping = aes(x = Pickup.point, fill = is_peakhr)) +
geom_bar() + theme(title = element_text(size=9, face="bold")) +
labs(title = "Trip Cancellation duirng the Day from Pickup point",
subtitle ="Summary: Trip Cancellations are high from city during peak morning hours (5-10AM)",
x = "Trip Cancellations Pickup Point ",y ="count of Cab Request")
# clearly cancellations are high in the peak hours ( 5-10AM) morning time and high from city
aggregate(Driver.id~is_peakhr + Pickup.point + request_hrs ,cancelled_trip,length)
## is_peakhr Pickup.point request_hrs Driver.id
## 1 Not_peakhr City 00 3
## 2 Not_peakhr City 01 4
## 3 Not_peakhr City 02 5
## 4 Not_peakhr City 03 2
## 5 Not_peakhr Airport 04 2
## 6 Not_peakhr City 04 49
## 7 peakhr_morning Airport 05 4
## 8 peakhr_morning City 05 172
## 9 peakhr_morning Airport 06 4
## 10 peakhr_morning City 06 141
## 11 peakhr_morning Airport 07 5
## 12 peakhr_morning City 07 164
## 13 peakhr_morning Airport 08 2
## 14 peakhr_morning City 08 176
## 15 peakhr_morning Airport 09 8
## 16 peakhr_morning City 09 167
## 17 peakhr_morning Airport 10 9
## 18 peakhr_morning City 10 53
## 19 Not_peakhr Airport 11 5
## 20 Not_peakhr City 11 10
## 21 Not_peakhr Airport 12 10
## 22 Not_peakhr City 12 9
## 23 Not_peakhr Airport 13 9
## 24 Not_peakhr City 13 9
## 25 Not_peakhr Airport 14 6
## 26 Not_peakhr City 14 5
## 27 Not_peakhr Airport 15 11
## 28 Not_peakhr City 15 10
## 29 Not_peakhr Airport 16 14
## 30 Not_peakhr City 16 8
## 31 peakhr_evening Airport 17 19
## 32 peakhr_evening City 17 16
## 33 peakhr_evening Airport 18 15
## 34 peakhr_evening City 18 9
## 35 peakhr_evening Airport 19 15
## 36 peakhr_evening City 19 9
## 37 peakhr_evening Airport 20 29
## 38 peakhr_evening City 20 12
## 39 peakhr_evening Airport 21 28
## 40 peakhr_evening City 21 14
## 41 peakhr_evening Airport 22 3
## 42 peakhr_evening City 22 9
## 43 Not_peakhr City 23 10
ggplot(data = cancelled_trip, mapping = aes(x = request_hrs, fill = is_peakhr)) +
geom_bar() +
labs(title = "Trip Cancellation during Peak hours To/from City/Airport",
subtitle ="Trip Cancellations are high during 5-10AM in the morning & is peak hour",
x = "Trip Cancellations During the day (hrs) ")

ggplot(data = cancelled_trip, mapping = aes(x = request_hrs, fill = is_peakhr)) +
geom_bar() + facet_wrap(~Pickup.point) +
theme(title = element_text(size=9, face="bold")) +
ggtitle("Trip Cancellation during Peak hours To/from City/Airport") + xlab("Booking hour")

# Again trip cancellations are high in the peak hours morning and from city to airport
supplygap_days_notrips_plot =
ggplot(data = cancelled_trip, mapping = aes(x = request_day, fill = is_peakhr)) +
geom_bar() + facet_wrap(~Pickup.point) +
theme(title = element_text(size=9, face="bold")) +
labs(title = "Trip Cancelled during various days To/from City/Airport",
subtitle ="Summary: Trip Cancellations persist through all days in morning peak hours from city",
x = "Weekdays of July 2016",y ="count of Cab Request")
# At day level peak hour morning cancellations are high from city inline with hours
#cancellations from city
grid.arrange(supplygap_notrips_plot, supplygap_days_notrips_plot, ncol =2)

# ANALYSIS/SUMMARY: Refer plot 6 in PPT:Journey Duration vs. Supply Gap
# From the left plot ??? Trip cancellations from the driver is high for city pickups during
#peak morning hours (peakhr_morn 5:10AM)
#From the Right Plot ??? Trip cancellation is independent of weekdays and persists through all weekdays during peak morning hours
#Trip cancellation is not depend on day and occurs in the morning peak hour every day.
##########################################################################
#############......Plot 7: NO CABS AVAIALBLE......###########################
##########.... (Refer Plot 7 in PPT): Cabs Unavailability.................##############
#Testing hypothesis, the cab unavailability trend during evening peak hours holds good
# by analyzing only peak hour data during all days.
# ASSUMPTIONS/ NOTES:
#Cab unavailable ??? When customer wants to book a trip but cab is unavailable. So Driver ID and
#Drop Time stamp data is unavailable for the analysis
#From the given Uber data set, we extracted data/rows related NO CAB AVAILABLE status only
#and analyzed patterns on this data. Hence, data considered for plots in this slide consists
#of 'NO CAB AVAILABLE' data only
# filtering Status "No Cars Available" rows from uber_data to further analysis
cabs_unavailable = uber_data %>% filter(Status == "No Cars Available") %>% group_by(Driver.id)
cabs_unavailable$is_peakhr = factor(ifelse(cabs_unavailable$request_hrs>="05" &
cabs_unavailable$request_hrs <="10",
"peakhr_morning",
ifelse(cabs_unavailable$request_hrs>="17" &
cabs_unavailable$request_hrs <="22",
"peakhr_evening","Not_peakhr")))
#View(cabs_unavailable)
supplygap_nocabs_plot =
ggplot(data = cabs_unavailable, mapping = aes(x = Pickup.point, fill = is_peakhr)) +
geom_bar() + theme(title = element_text(size=9, face="bold")) +
labs(title = "Cab Unavailability during Peak hours To/from City/Airport",
subtitle ="Summary: Cab Unavailability is high during peak evening hours from Airport",
x = "Pickup Point Cab Unavailability", y = "No.of Cab Requests (count)")
supplygap_nocabs_plot

# clearly cabs are unavailable from Airport during peak evening hours
ggplot(data = cabs_unavailable, mapping = aes(x = request_hrs, fill = is_peakhr)) +
geom_bar() + facet_wrap(~Pickup.point) +
theme(title = element_text(size=9, face="bold")) +
labs(title = "Cab Unavailability during Peak hours To/from City/Airport",
subtitle ="Summary: Cab Unavailability is high during peak evening hours from Airport",
x = "Pickup Point Cab Unavailability", y = "No.of Cabs (count)")

# this plot you can see more clearly the unavailablity during peak evening hours from Aiport
supplygap_days_nocabs_plot =
ggplot(data = cabs_unavailable, mapping = aes(x = request_day, fill = is_peakhr)) +
geom_bar() + facet_wrap(~Pickup.point) +
theme(title = element_text(size=9, face="bold"))+
labs(title = "Cab unavaialble during various days To/from City/Airport",
subtitle ="Summary: Cab Unavailability persists through all days in peak evening hours from Airport",
x = "Weekdays of July 2016", y = "No.of Cab Requests (count)")
grid.arrange(supplygap_nocabs_plot,supplygap_days_nocabs_plot, ncol=2)

# In line with hours data cab unavailability is high from airport during peak hours evening
# ANALYSIS: (Refer Plot 7 in PPT):Cabs Unavailability
#From the left plot ??? The Cabs unavailability is high for Airport pickups during peak
#evening hours (peakhr_evening:5 PM-10 PM)
#From the Right Plot ??? Cabs unavailability is independent of weekdays and persists through
#all weekdays during peak hours evening
# QUESTION:
# Peak hour evening wait time at airport is high. at the same time cabs are not available.
#Possibel Answer:
# Since the the wait time is higher at airport, cab drivers are not willing to accept trips
# to airport. SO cabs are not available.
#########################################################################
# the above plot tells that cab availability is less in the evening from airport to city
# Hence it is recommended that additional incentives required to improve cabs availability
# during evening time and from Airport to city from UBER
# Analysis: to reiterate, morning we have cancellation problem while evening during peak hours
# we have cabs unavailbility issue.
# from above plots there are good supply of cars available from the city to airport(about 2000 cars)
# if we compare with number of request (about 2300 request)
# However, from Airport to City only about 1000 cars avaialble for about 2700 requests
# this is a supply and bigger problem than city pickups where Uber needs to address.
# Question: What do you think is the reason for this issue for the supply-demand gap?
# The demand issue during morning peak hours is very marginal and can be solved easyly.
# most of the cab drives are cancelling during morning peak hours may be due to high
# travel time involved. So provide some additional incentives for city to airport pickup
#out major focus area is Peak hours in the evening where cab avaialbility is very low. So improve
# cab availability in the Airport during evening hours
####################....End of Analysis......###########################
##################################################################################################
#################################################################################################