########################################################################################

########################################################################################

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(stringr)
#install.packages("gridExtra")
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
#setwd("C:/Users/WL0681/OD/IIIT BA Course/C2_Statistics and Data Visualization/M4_Uber Supply_Demand Gap_Assignment")

uber_data = read.csv("Uber Request Data.csv")
str(uber_data)
## 'data.frame':    6745 obs. of  6 variables:
##  $ Request.id       : int  619 867 1807 2532 3112 3879 4270 5510 6248 267 ...
##  $ Pickup.point     : Factor w/ 2 levels "Airport","City": 1 1 2 1 2 1 1 1 2 2 ...
##  $ Driver.id        : int  1 1 1 1 1 1 1 1 1 2 ...
##  $ Status           : Factor w/ 3 levels "Cancelled","No Cars Available",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Request.timestamp: Factor w/ 5618 levels "11/7/2016 0:00",..: 81 266 1563 1252 2023 2790 3162 4389 5108 670 ...
##  $ Drop.timestamp   : Factor w/ 2598 levels "11/7/2016 0:51",..: 79 201 923 754 1125 1439 1599 2097 2400 412 ...
#View(uber_data)

# Request ID and Driver.ID Needs to be unique for a request and driver respectively so we need to change int to "factor" data type
#uber_data$Request.id = factor(uber_data$Request.id)
#uber_data$Driver.id = factor(uber_data$Driver.id)

#######################################################################################

########..........DATA CLEANING & PREPARAITION..........#####################

# Standardizing date fomat separator form "/" to "-" and removing seconds as this granular level analysis is not required
uber_data$Request.timestamp = str_replace_all(uber_data$Request.timestamp,"\\/","-")
uber_data$Drop.timestamp = str_replace_all(uber_data$Drop.timestamp,"\\/","-")

# converting into standard R date-time format 
uber_data$Request.timestamp =  as.POSIXct(uber_data$Request.timestamp, format = "%d-%m-%Y %H:%M")
uber_data$Drop.timestamp = as.POSIXct(uber_data$Drop.timestamp, format = "%d-%m-%Y %H:%M")

# creating separatel columns from the date & time: day, month, year, hours and minutes
uber_data$request_day = format(uber_data$Request.timestamp, "%d")
uber_data$request_month = format(uber_data$Request.timestamp, "%m")
uber_data$request_year = format(uber_data$Request.timestamp, "%Y")
uber_data$request_hrs = format(uber_data$Request.timestamp, "%H")
uber_data$request_min = format(uber_data$Request.timestamp, "%M")

# creating drop hours and drop minutes from the drop timestamp
uber_data$drop_hrs = format(uber_data$Drop.timestamp, "%H")
uber_data$drop_min = format(uber_data$Drop.timestamp,"%M")

uber_data$journey_minits = uber_data$Drop.timestamp-uber_data$Request.timestamp


uber_data$is_peakhr = factor(ifelse(uber_data$request_hrs>="05" & 
                                             uber_data$request_hrs <="10",
                                           "peakhr_morning",
                                           ifelse(uber_data$request_hrs>="17" & 
                                                    uber_data$request_hrs <="22",
                                                  "peakhr_evening","Not_peakhr")))
uber_data_backup =uber_data
#View(uber_data_backup)
#write.csv(uber_data_backup,"uber_data_tableau.csv")
#########################################################################################

###########.......UNIVARIATE ANALYSIS................##########################

# We need to find any repetitions in the dataset for each column. It seems same drivers have been doing multiple trips.
apply(uber_data,2,function(x) sum(duplicated(x)))
##        Request.id      Pickup.point         Driver.id            Status 
##                 0              6743              6444              6742 
## Request.timestamp    Drop.timestamp       request_day     request_month 
##              2729              4462              6740              6744 
##      request_year       request_hrs       request_min          drop_hrs 
##              6744              6721              6685              6720 
##          drop_min    journey_minits         is_peakhr 
##              6684              6681              6742
# There are  pickup/airport from and to the  city 
# 3 levels - no cars available, cancelled, trip completed
# each request is unique and this data is of July 2016
apply(uber_data,2,function(x) length(unique(x)))
##        Request.id      Pickup.point         Driver.id            Status 
##              6745                 2               301                 3 
## Request.timestamp    Drop.timestamp       request_day     request_month 
##              4016              2283                 5                 1 
##      request_year       request_hrs       request_min          drop_hrs 
##                 1                24                60                25 
##          drop_min    journey_minits         is_peakhr 
##                61                64                 3
nrow(uber_data)
## [1] 6745
############.......... Uber Bivariate & Segmented variate Analysis..........###########

# hint:  Create plots to visualize the frequency of requests that get cancelled or 
#show 'no cars available'; identify the most problematic types of requests 
#(city to airport / airport to city etc.) and the time slots (early mornings, late evenings etc.)

# Trip cancelled or no cars availability is high compared to trips completed. Uber is loosing revenue
# and >60% of the time customer is unable to obtain service from Uber due to this
ggplot(data = uber_data, mapping = aes(x = Status)) + geom_histogram(stat = "count") +
  labs(title ="Frequency of Status of Cab request", subtitle = "Frequency of Booking Status", x= "Booking reqst in a day (hrs)")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

#ggsave("status_plot.pdf")

########..... (Refer Plot 1 in PPT)  Frequency based on TRIP Status.......######
# From Airport to city there are very less cancellations. But MAJOR PROBLEM is CARS UNAVAILABLE at Airport for pickup
# City to Airport: Lot of cancellations? Why? But other hand trips completed to aiport is high

ggplot(data = uber_data, 
       mapping = aes(x = uber_data$Status, fill = Pickup.point )) + 
  geom_histogram(position = "stack", stat = "count") +
  theme(title = element_text(size=9, face="bold"))+
  labs(title ="Frequency of Status of Cab request", 
       subtitle = "Summary: About 60% of the requests customer is unable to obtain uber service due to cancellation/cab unvailability",
       x= "Booking reqst 'Status' in a day")
## Warning: Ignoring unknown parameters: binwidth, bins, pad

# ANALYSIS: Plot 1- 
# #Cancellations and cab unavailability are persistent in this case. About 60% of  times  
# Airport to city or city to Airport cab request were not catered due to either unavailability or 
# cancellations of the cab booking.
# Also Cab unavailability is high for Airport pickup requests and Cancellations are high for 
# City pickup requests


# Demand is high During 5 AM-10AM and 5PM-10PM during every day are peak hours
ggplot(data = uber_data, mapping = aes(x = request_hrs)) + geom_bar() + 
  labs(title ="Cab Demand/Request Pattern During the day", subtitle = "Demand is high from 5-10AM & 5-10PM", 
       x= "Booking reqst in a day (hrs)", y ="Cab demand pattern by hour")

#ggsave("demand cabs.pdf")

# CANCELLATIONS are high in the morning peak hours compared to Evening peak hours.
# During evening peak hours cab unavailability is high in the evening. 
# Overall, Uber is loosing business
rhrs_status = 
  ggplot(data = uber_data, mapping = aes(x = request_hrs, fill = Status)) +
  geom_bar() +  theme(title = element_text(size=9, face="bold")) +
  labs(title ="Cab Demand/Request Pattern During the day", 
      subtitle = "Peak hrs 5-10AM: High Cancellations, Peak hrs 5-10PM: High Cab unavailability ", 
       x= "Booking reqst in a day (hrs)")
rhrs_status

#ggsave("CabDemand_status.pdf")
rhrs_status_pickuppoint = rhrs_status + facet_wrap(~Pickup.point)


rhrs_pickupPoint = 
  ggplot(data = uber_data, mapping = aes(x = request_hrs, fill = Pickup.point)) +
  geom_bar() + theme(title = element_text(size=9, face="bold")) +
  labs(title ="Cab Demand/Request Pattern by Pickup Point", 
       subtitle = "5-10AM: High Demand at City Pickup (to Airport); 5-10PM: High Demand at Airport (to city) ", 
       x= "Booking reqst in a day (hrs)", caption ="Considering demand 5-10AM & 5-10PM are Problamatic") +
  annotate("text", x= 11, y= 500, col = "blue",label = "During 5-10 AM peak hours requests are high from city and 5-10PM requests/demand is high from Airport")
rhrs_pickupPoint

#ggsave("demandPattern_Pickuppoint.pdf")



grid.arrange(rhrs_status,rhrs_pickupPoint, ncol =2)

# Analaysis:
# During morning peak hours 5AM- 10AM city to Airport request are more
# During evening peak hours 5PM- 9PM  Airport to city request are more
# Driver trip cancelled rate is high in the morning peak hours
# where as, cabs unavailability is high in the evening peak hours 

#..... How journey_minits (Duration of journey)impacting overall business.....


jm_hrs_pickup = ggplot(data = uber_data, mapping = aes(y = journey_minits, x = request_hrs, fill = Pickup.point)) + 
  geom_boxplot() + theme(title = element_text(size=9, face="bold"))+
  labs(title ="Journey Duration pattern in a weekday ", 
       subtitle = "Summary: Journey duration is relatively high to Airport in the morning & high to city in evening", 
       x= "Booking reqstd during the day (Hrs)", y = "Journey Duration") 
jm_hrs_pickup  
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 3914 rows containing non-finite values (stat_boxplot).

mean(uber_data$journey_minits,na.rm = TRUE)
## Time difference of 52.41152 mins
#grid.arrange(rhrs_pickupPoint,rhrs_status,jm_hrs_pickup, ncol= 2,nrow =2)
#ggsave("demand gap.pdf")

# (Refer Plot 2 in PPT) Problematic Area for Uber
grid.arrange(rhrs_status_pickuppoint,jm_hrs_pickup, ncol= 1,nrow =2)
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.
## Warning: Removed 3914 rows containing non-finite values (stat_boxplot).

#ggsave("plot.pdf")

# ANALYSIS: Plot 2 -
# From top plot???  Peak hours are 05-10AM in the morning from the city . Peak hours
# are 05 PM-10 PM in the evening from airport.
# From bottom plot??? At high level the time taken to Airport travel is high in the
# morning peak hours.( Red color box area in the plot)
# So to mitigate supply demand gap Uber needs to have primary focus on peak hours 
# i.e. 05AM-10 AM and 05-PM-10PM time slots. Also should focus on cab cancellations in the morning (City) and Cabs unavailability in the evening peak hours slots.

##########........What is demand & supply in Uber case .?.........########
#1. Demand = number of cab requests done. i.e. requests  done while  
#Trip completed + Trip cancelled + Cabs unavailable
#2. Supply = Number of cab availability. Only trip completed is considered as supply.
# Trip cancelled / cab unavailability ideally did not cater customer so did not consider 
# as part of supply



###########################################################################################

# Why driver trip cancellation rate is high in the morning peak hours
# Ans: The median time taken to airport during morning peak hours for each trip to airport is higher
# compared with rest of the day.
# also journey time to city to airport is higher.


# Why cabs unavailablity is high in the evening peak hours
# Question: Why cab supply is less at Airport in the evening where peak demand exists from 5PM-9PM
# For each driver who is at the aiport what is the waiting time vs. driver from the city to airport

# Find the time slots when the highest gap exists
# 5AM-9AM and 5PM-10PM are the has highest gap exists in demand vs. supply

#Question:
# Find the types of requests (city-airport or airport-city) for which the gap is the most severe 
#in the identified time slots
# Answer:
# From above plot we can handle the cab cancellations (5AM-10AM) with some additional incentives to drivers
# so the driver takes up airport drop.
# Cab unavilability is during (5PM-10PM) in the aiport during this peak period is challenging and
# Uber needs to supply more cabs some how and increase cab availability. Please note that from Airport
# cab cancellations are very less and not a big challenge

############################################################################################

#####.... Finding Cab booking requsts (demand) Vs. Cabs availability (Supply) Gap during 
############          ..... PEAK HOURS (Morning & Evening) .....       ###############
##########......... Cab Supply Demand Gap (Refer Plot  3 in PPT)......############

# Assumption /Note: Only PEAK hours data for the slots 05-10AM & 05-10 PM has been
#considered  and extracted from given Uber data set for analysis


# Extracting peak hours data from uber data to peakhr_data dataframe
peakhr_data = uber_data %>% filter(request_hrs %in% c("05","06","07","08","09","10",
                                                      "17","18","19","20","21","22"))
peakhr_data$is_peakhr = factor(ifelse(peakhr_data$request_hrs>="05" & 
                                        peakhr_data$request_hrs<="10",
                                      "peakhr_morning","peakhr_evening"))


#View(peakhr_data)
# Peakhr_data dervied data frame considered for cab demand calculations
# drivers_trips derived data frame considered for supply demand calculations
# for this data frame we need to subset peak hours data

# demand for cabs during peak hours

# demand is taken from peakhr_data data frame where only peak hours data available
aggregate(Request.id~Pickup.point,peakhr_data,length)
##   Pickup.point Request.id
## 1      Airport       2484
## 2         City       2508
peakhr_cab_demand = aggregate(Request.id~Pickup.point+Status,peakhr_data,length)
names(peakhr_cab_demand) = c("Pickup.point","Status", "cab_demand")
peakhr_cab_demand
##   Pickup.point            Status cab_demand
## 1      Airport         Cancelled        141
## 2         City         Cancelled        942
## 3      Airport No Cars Available       1455
## 4         City No Cars Available        546
## 5      Airport    Trip Completed        888
## 6         City    Trip Completed       1020
# Cab supply  during peak hours

# cab supply is taken from driver_trips (has trip completed data) data frame where 
#only trip completed data available
# obtaining only peak hours data from driver_trip to find supply

driver_trips = uber_data %>% filter(!is.na(Driver.id) & Status =="Trip Completed") %>% 
  group_by(Driver.id)

trips_completed_peakhr = 
  driver_trips %>% 
  filter(request_hrs %in% c("05","06","07","08","09","10",
                            "17","18","19","20","21","22"))
#View(trips_completed_peakhr)

aggregate(Driver.id~Pickup.point,trips_completed_peakhr,length)
##   Pickup.point Driver.id
## 1      Airport       888
## 2         City      1020
peakhr_cab_supply = aggregate(Driver.id~Pickup.point + Status,trips_completed_peakhr,length)
names(peakhr_cab_supply) = c("Pickup.point","Status","cab_supply")
peakhr_cab_supply
##   Pickup.point         Status cab_supply
## 1      Airport Trip Completed        888
## 2         City Trip Completed       1020
#table(peakhr_cab_demand,peakhr_cab_supply)

peakhr_supply_demand= merge(peakhr_cab_demand,peakhr_cab_supply,all.x = TRUE)

#peakhr_supply_demand$Status = NULL
peakhr_supply_demand$cab_supply[which(is.na(peakhr_supply_demand$cab_supply))] =0
peakhr_supply_demand   
##   Pickup.point            Status cab_demand cab_supply
## 1      Airport         Cancelled        141          0
## 2      Airport No Cars Available       1455          0
## 3      Airport    Trip Completed        888        888
## 4         City         Cancelled        942          0
## 5         City No Cars Available        546          0
## 6         City    Trip Completed       1020       1020
# aggregate(Pickup.point~cab_demand,cab_supply,peakhr_supply_demand,sum)
# peakhr_supply_demand %>% group_by(Pickup.point,Status)


# Plotting cab demand and cab supply for finding out supply demand gap

peakhr_cab_demand_plot = ggplot(data = peakhr_supply_demand, 
                                aes(x = Pickup.point,y = cab_demand, fill = Status )) + 
  geom_bar(stat = "identity", position = "stack") +
  theme(title = element_text(size=9, face="bold"))+
  labs(title = "Cab Demand from Pickup Points during Peak hours", 
       subtitle ="Summary: Cab demand during peak hours based on Pickup Point and Status",
       caption ="source: peakhr_data data frame subsetted for peak hours" ,
       x = "Pickup.point", y = "Cab Demand (no of cab reqsts)")
peakhr_cab_demand_plot

peakhr_cab_supply_plot = ggplot(data = peakhr_supply_demand, 
                                aes(x = Pickup.point, y = cab_supply, fill = Status )) + 
  geom_bar(stat = "identity") +
  theme(title = element_text(size=9, face="bold"))+
  labs(title = "Cab Supply from Pickup Points during Peak hours", 
       subtitle ="Summary: Cab supply during peak hours based on Pickup Point and trips completed",
       caption ="source: driver_trips data frame subsetted for peak hours" ,
       x = "Pickup.point", y = "cab supply (no of cab reqsts)") +
  coord_cartesian(ylim = c(0,2500))
peakhr_cab_supply_plot

grid.arrange(peakhr_cab_demand_plot, peakhr_cab_supply_plot, ncol =2)

# ANALYSIS: Plot 3 -
# Case 1: 
#Morning Peak hours (05-10 AM) City Pickups: Cabs demand (requests) is high for 
#city pickups (city to airport) and  high cancellations (refer plot 2)  Uber was able to 
#cater to 41% of the total requests during morning peak hours
# Cab demand & Supply from city:
#Cab  supply is only ~1,020 and cab demand is 2,508. Hence Supply demand gap exists in 
#peak hour morning  from city pickups but lower than evening peak hours


#Case 2: 
#Evening Peak Hours (05-10 PM) Airport Pickups:  Cabs demand (requests) is high for
#Airport pickups (Airport to city). Cab unavailability is high (refer plot 2) and demand 
#supply gap from airport & Peakhr_evening is evident from above plot. Only 36% of total requests
#from Airport were catered during evening peak hours. Hence, Evening from Airport supply demand
#gap is high during peak hours
# cab demand & supply from Airport:
#Cab  supply is only ~888 and cab demand is 2,484. Hence Supply demand gap exists in peak hour 
#evening from Airport pickup


### Demand Supply gap for entire data set - plot 3.1 in PPT###########

########......Cab Supply Demand Gap- Entire Day (plot 3.1 in PPT).....#########

# demand for cabs during entire day ........

# demand is taken from uber_data data frame where entire cab requests data available
aggregate(Request.id~Pickup.point,uber_data,length)
##   Pickup.point Request.id
## 1      Airport       3238
## 2         City       3507
overall_cab_demand = aggregate(Request.id~Pickup.point+Status,uber_data,length)
names(overall_cab_demand) = c("Pickup.point","Status", "overall_cab_demand")
overall_cab_demand
##   Pickup.point            Status overall_cab_demand
## 1      Airport         Cancelled                198
## 2         City         Cancelled               1066
## 3      Airport No Cars Available               1713
## 4         City No Cars Available                937
## 5      Airport    Trip Completed               1327
## 6         City    Trip Completed               1504
# Cab supply  during peak hours

# cab supply is taken from uber_date (has trip completed data) data frame where 
#only trip completed data available
# obtaining only trip completed data from uber_data to find supply
trips_completed_entireday = 
  uber_data %>% filter(Status == "Trip Completed")
#View(trips_completed_entireday)

aggregate(Driver.id~Pickup.point,trips_completed_entireday,length)
##   Pickup.point Driver.id
## 1      Airport      1327
## 2         City      1504
overall_cab_supply = aggregate(Driver.id~Pickup.point + Status,trips_completed_entireday,length)
names(overall_cab_supply) = c("Pickup.point","Status","overall_cab_supply")
overall_cab_supply
##   Pickup.point         Status overall_cab_supply
## 1      Airport Trip Completed               1327
## 2         City Trip Completed               1504
overall_supply_demand= merge(overall_cab_demand,overall_cab_supply,all.x = TRUE)
overall_supply_demand
##   Pickup.point            Status overall_cab_demand overall_cab_supply
## 1      Airport         Cancelled                198                 NA
## 2      Airport No Cars Available               1713                 NA
## 3      Airport    Trip Completed               1327               1327
## 4         City         Cancelled               1066                 NA
## 5         City No Cars Available                937                 NA
## 6         City    Trip Completed               1504               1504
overall_supply_demand$overall_cab_supply[which(is.na(overall_supply_demand$overall_cab_supply))] =0
overall_supply_demand   
##   Pickup.point            Status overall_cab_demand overall_cab_supply
## 1      Airport         Cancelled                198                  0
## 2      Airport No Cars Available               1713                  0
## 3      Airport    Trip Completed               1327               1327
## 4         City         Cancelled               1066                  0
## 5         City No Cars Available                937                  0
## 6         City    Trip Completed               1504               1504
# Plotting overall cab demand and supply gap

overall_cab_demand_plot = ggplot(data = overall_supply_demand, 
                                 aes(x = Pickup.point,y = overall_cab_demand, fill = Status )) + 
  geom_bar(stat = "identity", position = "stack") +
  theme(title = element_text(size=9, face="bold"))+
  labs(title = "Cab Demand of entire day from Pickup Points", 
       subtitle ="Summary: Entire day cab demand is based on Pickup Point and Status",
       caption ="source: uber_data data frame subsetted for peak hours" ,
       x = "Pickup.point", y = "overall Cab Demand (No. of cab reqsts for entire day)")
overall_cab_demand_plot

overall_cab_supply_plot = ggplot(data = overall_supply_demand, 
                                 aes(x = Pickup.point, y = overall_cab_supply, fill = Status )) + 
  geom_bar(stat = "identity") +
  theme(title = element_text(size=9, face="bold"))+
  labs(title = "Cab Supply for entire from Pickup Points", 
       subtitle ="Summary: Entire day Cab supply is based on Pickup Point and trips completed",
       caption ="source: uber_data data frame subsetted for Trip completed" ,
       x = "Pickup.point", y = "overall cab supply (No.of cab reqsts for entire day") +
  coord_cartesian(ylim = c(0,3500))
overall_cab_supply_plot

grid.arrange(overall_cab_demand_plot, overall_cab_supply_plot, ncol =2)

# ANALYSIS: (plot 3.1 from PPT)
#Case 1 : City Pickups: Uber was able to cater to 42% of the customer requests during entire day. 
#During peak hours (from plot 3) Uber was able to cater to 41% of the customer requests
# cab demand and supply from city:
# Cab  supply is only ~1,504 and cab demand is 3,507. Hence Supply demand gap exists for
#city pick ups as well


#Case 2: Airport Pickups: Uber was able to cater to 40% of the customer requests during entire
#day for the data given period i.e. 11th to 15th July 2016. During peak hours only 36% of the 
#request are catered.
# cab demand and supply from airport:
# Cab  supply is only ~1,327 and cab demand is 3,238. Hence Supply demand gap exists from 
#Airport pickups

# Conclusion: Demand and supply gap exists for city and airport pick ups during entire day of 
#operation. However, during peak hours evening Airport pickups are challenging and 
#supply demand gap is acute.

##############################################################################################

#########... TIME SPENT AT AIRPORT or in CIty WITHOUT A TRIP (IDLE TIME AT AIRPORT/City)...##########
#######....Refer plot 4 in PPT: Wating time patterns at Airport .......######

#KEY ASSUMPTION:
# only city to airport and airport to city data available. It was assumed that driver only
# does Airport/city pickups which is not true in real world. In between each Airport/city pickup
# there will be near by trips.However, as this data not available we are assuming drivers do not
# take any near by trips and only tool Airport to City or city to airport trips

driver_trips = data.frame(driver_trips)

trip_count = data.frame(table(driver_trips$Driver.id))[,2]

#View(driver_trips)

# declaring empty vectors for catching  variables inside user defined function idle_time
waittime = vector()
d_id = vector()
temp = vector()
rownum = vector()
index =0

#Idle_time_airport  = data.frame(idle_time(driver_trips))

# Derving idle time based on pickup and drop point
idle_time = function(dt, pickup_point,drop_point) {
  
  for (i in 1:length(trip_count)) {
    
    for(j in 1:trip_count[i]) {
      k = sum(index,j)
      
      if(dt$Pickup.point[k] == pickup_point & dt$Pickup.point[k+1] == drop_point) {
        #print(k)
        timediff = difftime(dt$Request.timestamp[k+1], dt$Drop.timestamp[k], units = "hours") 
        idletime_mins =  str_extract_all(timediff,"\\d+", simplify = TRUE)[,1]
        #print(idletime_mins)
        waittime[length(waittime)+1] = idletime_mins
        d_id[length(d_id)+1] = i
        rownum[length(rownum)+1]= index+j
      }
    }
    temp[i] = trip_count[i]
    index = sum(temp)
    
  } 
  
  dat_frame = data.frame(rownum, d_id,waittime)
  #write.csv(dat_frame, file = "mywork.csv")
  return(dat_frame)
}

# function call for Identifying idle time at the airport
idle_time_airport = idle_time(driver_trips,"City","Airport")
#idle_time_airport
#Assumption: We have > 14 hours of waiting time at the airport which is not recommended/ have very less probability
# so removing waiting hours which are >14hrs considering as outliers

idle_time_airport$waittime = as.numeric(as.character(idle_time_airport$waittime))
# ASSUMPTION: Max. Waiting time taken as 14 Hrs
idle_time_airport = idle_time_airport[which(idle_time_airport$waittime <= 14),]
#write.csv(idle_time_airport,"idletime_airport.csv")
# obtaining aggregation on Driver.ID to find out avg wait time
mean_waittime_airport = round(aggregate(waittime~d_id, idle_time_airport,mean),1)
median(idle_time_airport$waittime)
## [1] 5
# Refer Plot 4: Waiting time patterns at Airport
# plotting waiting time at airport distribtuion for each driver 
airport_idletime_plot = ggplot(data = idle_time_airport, aes(x = waittime, y = d_id, group = waittime )) + geom_boxplot() + 
  geom_point(aes(x=mean(waittime)), col = "red") + 
  theme(title = element_text(size=9, face="bold")) +
  labs(title = "Each Driver waiting time pattern at Airpot with Mean waiting time",
       subtitle ="Summary: Each Driver's waiting time at Airport is very irregular ranging from 0HRs to 14HRs",
       x = "Waiting time at Airport in Hrs",y ="Driver ID/Each Driver")
airport_idletime_plot

########################################################################################

#####---- Identify what time of the day waiting time is more------#######
##### Waiting Time Patterns at Airport (Refer Plot 4 in PPT)

# Assumptions / Note: 
# Waiting time in the Airport in plot 1 arrived based on 
# 1. The difference between drop time (drop time stamp) of  a city pick up and  
# pick up(request.timestamp) time of city pickup
# 2. After a city to Airport trip is completed driver waited at the airport for
# the next pick up to the city
# 3. Maximum wait time of <=14 hours at the airport has been assumed. The chances 
# of getting a trip is less if he is unable to get a trip during peak hours. 
# E.g. driver dropped at 6 AM if he is not getting a trip till 8 PM of the same day probability 
# of getting an trip from Airport to city is marginal

rowindex = idle_time_airport$rownum

#Choosing the data/rows from driver_trips (contains all trip completed data) based on row numbers 
#extracted from idle_time_airport data frame (contains idel time for each driver 
#who did Airport pickup and concered row numbers)
df_waittime_airport = driver_trips[rowindex,]

df_waittime_airport$idle_time = idle_time_airport$waittime


df_waittime_airport$is_peakhr = factor(ifelse(df_waittime_airport$request_hrs>="05" & 
                                                df_waittime_airport$request_hrs <="10",
                                      "peakhr_morning",
                                      ifelse(df_waittime_airport$request_hrs>="17" & 
                                               df_waittime_airport$request_hrs <="22",
                                             "peakhr_evening","Not_peakhr")))

#View(df_waittime_airport)

peakhr_wait_plot = ggplot(df_waittime_airport, aes(x = df_waittime_airport$request_hrs, 
                                y = df_waittime_airport$idle_time, fill = is_peakhr)) +
  geom_boxplot() + theme(title = element_text(size=9, face="bold")) +
  labs(title = "Airport Waiting time pattern during the day",
       subtitle ="Summary: Median wating time for new booking is relatively high during evening peak hours",
       x = "Cab Booking Reqsted during day (hrs)",y ="Waiting time at Ariport for booking(Hrs)")

  
grid.arrange(airport_idletime_plot,peakhr_wait_plot, ncol = 2)

#ANALYSIS: (Refer Plot 4 in PPT): Waiting time patterns at Airport
# From the left plot ??? The waiting time of each of the driver is very irregular varying
# from 0hrs to 15hrs. 
# From right plot ??? Median waiting time during peak evening hours is relatively high
# and time variation in box plot is high during pea hours. These reasons and uncertainty around wait time might be discouraging drivers to take up trips to and from airport
# Driver had to wait more than 5 hours in the airport for the booking also during peak hours


#########################################################################################

#####........ for completed trips, Journey time during peak hours.......#######
############ refer plot 5 in PPT: Journey Duration vs. Supply Gap ###################

# ASSUMPTION / NOTE:
#Journey Duration is the time difference in minutes between pick up and drop time
#of a specific trip/request id

driver_trips$is_peakhr = factor(ifelse(driver_trips$request_hrs>="05" & 
                                       driver_trips$request_hrs <="10",
                                      "peakhr_morning",
                                              ifelse(driver_trips$request_hrs>="17" & 
                                                     driver_trips$request_hrs <="22",
                                                     "peakhr_evening","Not_peakhr")))

##### ... Please refer Plot 5 in ppt
ggplot(driver_trips, mapping = aes(x= request_hrs, y = journey_minits, fill = is_peakhr)) +
  geom_boxplot() + facet_wrap(~Pickup.point) +
  theme(title = element_text(size=9, face="bold")) +
  labs(title = "Journey Duration Airport vs. City Pickups",
       subtitle ="Summary: During peak hours Journey duration is same for both Airport and city pickups",
       x = "Cab Booking hours in a day",y ="Journey Duraion in (minutes)")
## Don't know how to automatically pick scale for object of type difftime. Defaulting to continuous.

# ggplot(driver_trips, aes(x= is_peakhr, y= journey_minits)) + geom_boxplot()
# ggplot(driver_trips, aes(x= Pickup.point, y= journey_minits)) + geom_boxplot()

#ANALYSIS: (refer plot 5 in PPT:Journey Duration vs. Supply Gap)
#From the plot below the journey duration during peak morning is relatively high compared 
#to  peak evening hours. Hence, it is most likely that chances of driver cancelling/not 
#interested to go to airport is high due to high journey duration

#########################################################################################

######...... Trip cancelled by Driver.....##################
########......Refer PLOT 6 in PPT: TRIP Cancellation ........###########
# Assumptions/Note:
#Trip Cancellation ??? When driver cancels a pickup request from the customer.
#This case all  the data in the given dataset is for trip cancelled status except drop time of a trip which will not be available as trip got cancelled.
#From the given Uber data set, we extracted data/rows related TRIP CANCELLATIONS status 
#only and analyzed patterns on this data. Hence, data considered for plots in this slide
#consists of TRIP cancellation data only

# Testing hypothesis,  the cab cancellation trend during morning peak hours holds good by
#analyzing only peak hour data during all days.

# filtering Status "Trip Cancelled" rows from uber_data to further analysis
cancelled_trip = uber_data %>% filter(Status == "Cancelled") %>% group_by(Driver.id)

cancelled_trip$is_peakhr = factor(ifelse(cancelled_trip$request_hrs>="05" & 
                                         cancelled_trip$request_hrs <="10",
                                       "peakhr_morning",
                                       ifelse(cancelled_trip$request_hrs>="17" & 
                                              cancelled_trip$request_hrs <="22",
                                              "peakhr_evening","Not_peakhr")))

#View(cancelled_trip)



supplygap_notrips_plot =
ggplot(data = cancelled_trip, mapping = aes(x = Pickup.point, fill = is_peakhr)) + 
  geom_bar() + theme(title = element_text(size=9, face="bold")) +
  labs(title = "Trip Cancellation duirng the Day from Pickup point",
       subtitle ="Summary: Trip Cancellations are high from city during peak morning hours (5-10AM)",
       x = "Trip Cancellations Pickup Point ",y ="count of Cab Request")
# clearly cancellations are high in the peak hours ( 5-10AM) morning time and high from city

aggregate(Driver.id~is_peakhr + Pickup.point + request_hrs ,cancelled_trip,length)
##         is_peakhr Pickup.point request_hrs Driver.id
## 1      Not_peakhr         City          00         3
## 2      Not_peakhr         City          01         4
## 3      Not_peakhr         City          02         5
## 4      Not_peakhr         City          03         2
## 5      Not_peakhr      Airport          04         2
## 6      Not_peakhr         City          04        49
## 7  peakhr_morning      Airport          05         4
## 8  peakhr_morning         City          05       172
## 9  peakhr_morning      Airport          06         4
## 10 peakhr_morning         City          06       141
## 11 peakhr_morning      Airport          07         5
## 12 peakhr_morning         City          07       164
## 13 peakhr_morning      Airport          08         2
## 14 peakhr_morning         City          08       176
## 15 peakhr_morning      Airport          09         8
## 16 peakhr_morning         City          09       167
## 17 peakhr_morning      Airport          10         9
## 18 peakhr_morning         City          10        53
## 19     Not_peakhr      Airport          11         5
## 20     Not_peakhr         City          11        10
## 21     Not_peakhr      Airport          12        10
## 22     Not_peakhr         City          12         9
## 23     Not_peakhr      Airport          13         9
## 24     Not_peakhr         City          13         9
## 25     Not_peakhr      Airport          14         6
## 26     Not_peakhr         City          14         5
## 27     Not_peakhr      Airport          15        11
## 28     Not_peakhr         City          15        10
## 29     Not_peakhr      Airport          16        14
## 30     Not_peakhr         City          16         8
## 31 peakhr_evening      Airport          17        19
## 32 peakhr_evening         City          17        16
## 33 peakhr_evening      Airport          18        15
## 34 peakhr_evening         City          18         9
## 35 peakhr_evening      Airport          19        15
## 36 peakhr_evening         City          19         9
## 37 peakhr_evening      Airport          20        29
## 38 peakhr_evening         City          20        12
## 39 peakhr_evening      Airport          21        28
## 40 peakhr_evening         City          21        14
## 41 peakhr_evening      Airport          22         3
## 42 peakhr_evening         City          22         9
## 43     Not_peakhr         City          23        10
ggplot(data = cancelled_trip, mapping = aes(x = request_hrs, fill = is_peakhr)) + 
  geom_bar() +
  labs(title = "Trip Cancellation during Peak hours To/from City/Airport", 
       subtitle ="Trip Cancellations are high during 5-10AM in the morning & is peak hour",
       x = "Trip Cancellations During the day (hrs) ")

ggplot(data = cancelled_trip, mapping = aes(x = request_hrs, fill = is_peakhr)) + 
  geom_bar() + facet_wrap(~Pickup.point) + 
  theme(title = element_text(size=9, face="bold")) +
  ggtitle("Trip Cancellation during Peak hours To/from City/Airport") + xlab("Booking hour")

# Again trip cancellations are high in the peak hours morning and from city to airport



supplygap_days_notrips_plot =
ggplot(data = cancelled_trip, mapping = aes(x = request_day, fill = is_peakhr)) + 
  geom_bar() + facet_wrap(~Pickup.point) + 
  theme(title = element_text(size=9, face="bold")) +
  labs(title = "Trip Cancelled during various days To/from City/Airport", 
       subtitle ="Summary: Trip Cancellations persist through all days in morning peak hours from city",
       x = "Weekdays of July 2016",y ="count of Cab Request")
# At day level peak hour morning cancellations are high from city inline with hours
#cancellations from city

grid.arrange(supplygap_notrips_plot, supplygap_days_notrips_plot, ncol =2)

# ANALYSIS/SUMMARY: Refer plot 6 in PPT:Journey Duration vs. Supply Gap

# From the left plot ??? Trip cancellations from the driver is high for city pickups during
#peak morning hours (peakhr_morn 5:10AM)
#From the Right Plot ??? Trip cancellation is independent of weekdays and persists through all weekdays during peak morning hours
#Trip cancellation is not depend on day and occurs in the morning peak hour every day.


##########################################################################

#############......Plot 7: NO CABS AVAIALBLE......###########################
##########.... (Refer Plot 7 in PPT): Cabs Unavailability.................##############

#Testing hypothesis,  the cab unavailability trend during evening peak hours holds good 
# by analyzing only peak hour data during all days.

# ASSUMPTIONS/ NOTES:
#Cab unavailable ??? When customer wants to book a trip but cab is unavailable. So Driver ID and
#Drop Time stamp data is unavailable for the analysis
#From the given Uber data set, we extracted data/rows related NO CAB AVAILABLE status only 
#and analyzed patterns on this data. Hence, data considered for plots in this slide consists
#of 'NO CAB AVAILABLE' data only

# filtering Status "No Cars Available" rows from uber_data to further analysis 
cabs_unavailable = uber_data %>% filter(Status == "No Cars Available") %>% group_by(Driver.id)
cabs_unavailable$is_peakhr = factor(ifelse(cabs_unavailable$request_hrs>="05" & 
                                           cabs_unavailable$request_hrs <="10",
                                           "peakhr_morning",
                                            ifelse(cabs_unavailable$request_hrs>="17" & 
                                                   cabs_unavailable$request_hrs <="22",
                                                   "peakhr_evening","Not_peakhr")))

#View(cabs_unavailable)
supplygap_nocabs_plot = 
  ggplot(data = cabs_unavailable, mapping = aes(x = Pickup.point, fill = is_peakhr)) + 
  geom_bar() + theme(title = element_text(size=9, face="bold")) +
  labs(title = "Cab Unavailability during Peak hours To/from City/Airport", 
       subtitle ="Summary: Cab Unavailability is high during peak evening hours from Airport",
       x = "Pickup Point Cab Unavailability", y = "No.of Cab Requests (count)")
supplygap_nocabs_plot

# clearly cabs are unavailable from Airport during peak evening hours

ggplot(data = cabs_unavailable, mapping = aes(x = request_hrs, fill = is_peakhr)) + 
  geom_bar() + facet_wrap(~Pickup.point) + 
  theme(title = element_text(size=9, face="bold")) +
  labs(title = "Cab Unavailability during Peak hours To/from City/Airport", 
       subtitle ="Summary: Cab Unavailability is high during peak evening hours from Airport",
       x = "Pickup Point Cab Unavailability", y = "No.of Cabs (count)")

# this plot you can see more clearly the unavailablity during peak evening hours from Aiport


supplygap_days_nocabs_plot =
  ggplot(data = cabs_unavailable, mapping = aes(x = request_day, fill = is_peakhr)) + 
  geom_bar() + facet_wrap(~Pickup.point) + 
  theme(title = element_text(size=9, face="bold"))+
  labs(title = "Cab unavaialble during various days To/from City/Airport", 
       subtitle ="Summary: Cab Unavailability persists through all days in peak evening hours from Airport",
       x = "Weekdays of July 2016", y = "No.of Cab Requests (count)")

grid.arrange(supplygap_nocabs_plot,supplygap_days_nocabs_plot, ncol=2)

# In line with hours data cab unavailability is high from airport during peak hours evening

# ANALYSIS: (Refer Plot 7 in PPT):Cabs Unavailability
#From the left plot ??? The Cabs unavailability is high for Airport pickups during peak
#evening hours (peakhr_evening:5 PM-10 PM)
#From the Right Plot ??? Cabs unavailability is independent of weekdays and persists through
#all weekdays during peak hours evening 

# QUESTION:
# Peak hour evening wait time at airport is high. at the same time cabs are not available.
#Possibel Answer:
# Since the the wait time is higher at airport, cab drivers are not willing to accept trips
# to airport. SO cabs are not available.

#########################################################################

# the above plot tells that cab availability is less in the evening from airport to city
# Hence it is recommended that additional incentives required to improve cabs availability
# during evening time and from Airport to city from UBER

# Analysis: to reiterate, morning we have cancellation problem while evening during peak hours
# we have cabs unavailbility issue.
# from above plots there are good supply of cars available from the city to airport(about 2000 cars)
# if we compare with number of request (about 2300 request)
# However, from Airport to City only about 1000 cars avaialble for about 2700 requests
# this is a supply and bigger problem than city pickups where Uber needs to address.


# Question: What do you think is the reason for this issue for the supply-demand gap? 
# The demand issue during morning peak hours is very marginal and can be solved easyly.
# most of the cab drives are cancelling during morning peak hours may be due to high
# travel time involved. So provide some additional incentives for city to airport pickup

#out major focus area is Peak hours in the evening where cab avaialbility is very low. So improve
# cab availability in the Airport during evening hours


####################....End of Analysis......###########################

##################################################################################################

#################################################################################################