# load r packages
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(dplyr)

install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(ggplot2)
# Establish general directory
getwd()
## [1] "/cloud/project"
# List files in specific directory
list.files ("/cloud/project/data")
## [1] "ncr_ride_bookings.csv"
# Load dataset
uber <- read.csv("/cloud/project/data/ncr_ride_bookings.csv")
head(uber)
##         Date     Time   Booking.ID  Booking.Status  Customer.ID  Vehicle.Type
## 1 2024-03-23 12:29:38 "CNR5884300" No Driver Found "CID1982111"         eBike
## 2 2024-11-29 18:01:39 "CNR1326809"      Incomplete "CID4604802"      Go Sedan
## 3 2024-08-23 08:56:10 "CNR8494506"       Completed "CID9202816"          Auto
## 4 2024-10-21 17:17:25 "CNR8906825"       Completed "CID2610914" Premier Sedan
## 5 2024-09-16 22:08:00 "CNR1950162"       Completed "CID9933542"          Bike
## 6 2024-02-06 09:44:56 "CNR4096693"       Completed "CID4670564"          Auto
##       Pickup.Location     Drop.Location Avg.VTAT Avg.CTAT
## 1         Palam Vihar           Jhilmil     null     null
## 2       Shastri Nagar Gurgaon Sector 56      4.9     14.0
## 3             Khandsa     Malviya Nagar     13.4     25.8
## 4 Central Secretariat          Inderlok     13.1     28.5
## 5    Ghitorni Village       Khan Market      5.3     19.6
## 6               AIIMS       Narsinghpur      5.1     18.1
##   Cancelled.Rides.by.Customer Reason.for.cancelling.by.Customer
## 1                        null                              null
## 2                        null                              null
## 3                        null                              null
## 4                        null                              null
## 5                        null                              null
## 6                        null                              null
##   Cancelled.Rides.by.Driver Driver.Cancellation.Reason Incomplete.Rides
## 1                      null                       null             null
## 2                      null                       null                1
## 3                      null                       null             null
## 4                      null                       null             null
## 5                      null                       null             null
## 6                      null                       null             null
##   Incomplete.Rides.Reason Booking.Value Ride.Distance Driver.Ratings
## 1                    null          null          null           null
## 2       Vehicle Breakdown           237          5.73           null
## 3                    null           627         13.58            4.9
## 4                    null           416         34.02            4.6
## 5                    null           737         48.21            4.1
## 6                    null           316          4.85            4.1
##   Customer.Rating Payment.Method
## 1            null           null
## 2            null            UPI
## 3             4.9     Debit Card
## 4             5.0            UPI
## 5             4.3            UPI
## 6             4.6            UPI
#Vehicle type with highest average ride distance and booking value?
# Step1: load dplyr package
library (dplyr)

# Step2: convert to numeric
uber <- uber %>%
  mutate(
    Ride.Distance=as.numeric(Ride.Distance),
    Booking.Value=as.numeric(Booking.Value)
  )
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Ride.Distance = as.numeric(Ride.Distance)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Step 3: aggregate and group
busiest_vehicle <- uber %>%
  filter(!is.na(Ride.Distance),!is.na(Booking.Value))%>%
  group_by(Vehicle.Type) %>%
  summarise(
    Avg_ride_dist=mean(Ride.Distance),
    Avg_booking_value=mean(Booking.Value)
    ,.groups='drop')%>%
  arrange(desc(Avg_ride_dist),desc(Avg_booking_value))

# Step 4: print results
print(busiest_vehicle)
## # A tibble: 7 × 3
##   Vehicle.Type  Avg_ride_dist Avg_booking_value
##   <chr>                 <dbl>             <dbl>
## 1 eBike                  25.0              504.
## 2 Bike                   24.6              510.
## 3 Auto                   24.6              507.
## 4 Go Mini                24.6              508.
## 5 Go Sedan               24.6              512.
## 6 Premier Sedan          24.6              510.
## 7 Uber XL                24.4              502.
# Step 5: visualize results
library(ggplot2)
ggplot (data=busiest_vehicle,mapping=aes(x=Vehicle.Type,y=Avg_ride_dist))+
  geom_col(color='black',fill='lightblue', position='stack')+
  labs(title='Busiest Vehicle Type by Booking and Ride Distance')+
  theme(axis.text.x = element_text(angle=45,hjust = 1))

# What is the cancellation rate by customers versus drivers across different pickup locations?
#step1: convert to numeric
uber <- uber %>%
  mutate(
    Cancelled.Rides.by.Customer=as.numeric(Cancelled.Rides.by.Customer),
    Cancelled.Rides.by.Driver=as.numeric(Cancelled.Rides.by.Driver)
  )
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Cancelled.Rides.by.Customer =
##   as.numeric(Cancelled.Rides.by.Customer)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
#Step2: calculate rate
cancellation_rate <- uber %>%
  group_by(Pickup.Location)%>%
  summarise(
      total_rides=n(),
    total_customers=(sum(Cancelled.Rides.by.Customer,na.rm=TRUE)/total_rides)*100,
total_drivers=(sum(Cancelled.Rides.by.Driver,na.rm=TRUE)/total_rides)*100
  ) %>%
  slice(1:10)

# Step 3: print results
print(cancellation_rate)
## # A tibble: 10 × 4
##    Pickup.Location  total_rides total_customers total_drivers
##    <chr>                  <int>           <dbl>         <dbl>
##  1 AIIMS                    918            6.54          19.2
##  2 Adarsh Nagar             858            7.23          16.4
##  3 Akshardham               839            9.30          20.3
##  4 Ambience Mall            873            6.64          17.3
##  5 Anand Vihar              859            6.64          16.5
##  6 Anand Vihar ISBT         831            8.30          16.8
##  7 Ardee City               861            7.43          16.7
##  8 Arjangarh                810            9.14          16.7
##  9 Ashok Park Main          884            6.56          17.1
## 10 Ashok Vihar              796            6.28          18.0
# How do average VTAT (vehicle turnaround time) & CTAT (customer turnaround 
# time) vary by vehicle type?

# Step1: convert to numeric
uber <- uber %>%
  mutate(
    Avg.VTAT = as.numeric(Avg.VTAT),
    Avg.CTAT = as.numeric(Avg.CTAT)
  )
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Avg.VTAT = as.numeric(Avg.VTAT)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Step2: aggregate vehical stats
vehicle_stats <- uber %>%
  group_by (Vehicle.Type) %>%
  summarise(
    Average_vtat = mean(Avg.VTAT,na.rm=TRUE),
    Average_ctat = mean(Avg.CTAT,na.rm=TRUE)
  )
# Step3: print results
print(vehicle_stats)
## # A tibble: 7 × 3
##   Vehicle.Type  Average_vtat Average_ctat
##   <chr>                <dbl>        <dbl>
## 1 Auto                  8.45         29.1
## 2 Bike                  8.50         29.2
## 3 Go Mini               8.47         29.2
## 4 Go Sedan              8.40         29.0
## 5 Premier Sedan         8.44         29.2
## 6 Uber XL               8.58         29.2
## 7 eBike                 8.48         29.2
# What are the top three most common reasons customers cancel rides?

# Step1: aggregate cancel reasons
cancelled_rides <-  uber %>%
  filter(!is.na(Reason.for.cancelling.by.Customer))%>%
  group_by (Reason.for.cancelling.by.Customer) %>%
  summarise(count=n(),.groups='drop')

# Step2: print results
print(cancelled_rides)
## # A tibble: 6 × 2
##   Reason.for.cancelling.by.Customer             count
##   <chr>                                         <int>
## 1 AC is not working                              1155
## 2 Change of plans                                2353
## 3 Driver asked to cancel                         2295
## 4 Driver is not moving towards pickup location   2335
## 5 Wrong Address                                  2362
## 6 null                                         139500
# Step3: visualize results
ggplot(data=cancelled_rides,mapping=aes(x=Reason.for.cancelling.by.Customer,y=count))+
  geom_col(color='black',fill='green',)+
  labs(title='top three most common reasons customers cancel rides')

# Do incomplete rides occur more frequently in certain pickup locations?
# step1: aggregate and group by pick up location
location_status <-  uber %>%
  group_by (Pickup.Location)%>%
  summarise(count=n(),.groups='drop')%>%
  arrange(desc(count))%>%
  slice(1:10)

# step2: print results
print(location_status)
## # A tibble: 10 × 2
##    Pickup.Location  count
##    <chr>            <int>
##  1 Khandsa            949
##  2 Barakhamba Road    946
##  3 Saket              931
##  4 Badarpur           921
##  5 Pragati Maidan     920
##  6 Madipur            919
##  7 AIIMS              918
##  8 Mehrauli           915
##  9 Dwarka Sector 21   914
## 10 Pataudi Chowk      907
#How does booking value correlate with ride distance across all bookings?
# calculate correlation
corr_result <- cor(uber$Booking.Value,uber$Ride.Distance,use='complete.obs')

# print results
print(corr_result)
## [1] 0.005173977
# Which payment method is used most often by vehicle type?
# Aggregate and group by vehicle type
popular_payment <- uber %>%
  group_by (Vehicle.Type)%>%
  summarise(count=n(),.groups='drop')%>%
  arrange(desc(count))

# print results
print (popular_payment)
## # A tibble: 7 × 2
##   Vehicle.Type  count
##   <chr>         <int>
## 1 Auto          37419
## 2 Go Mini       29806
## 3 Go Sedan      27141
## 4 Bike          22517
## 5 Premier Sedan 18111
## 6 eBike         10557
## 7 Uber XL        4449
#visualize results
ggplot(data=popular_payment,mapping=aes(x=Vehicle.Type,y=count))+
  geom_col(color='black',fill='orange')+
  labs(title='Payment Type by Vehicle Type')

# What time of day sees the highest booking cancellations by drivers?
busiest_time <- uber %>%
  group_by(Time) %>%
  summarise(cancel_total=sum(Cancelled.Rides.by.Driver,na.rm=TRUE),.groups='drop')%>%
  arrange(desc(cancel_total))%>%
  slice(1:5)

# print results
print (busiest_time)
## # A tibble: 5 × 2
##   Time     cancel_total
##   <chr>           <dbl>
## 1 18:40:56            6
## 2 09:46:34            5
## 3 10:36:44            5
## 4 19:30:47            5
## 5 08:15:05            4
# visualize results
ggplot(data=busiest_time,mapping=aes(x=Time,y=cancel_total,size=cancel_total))+
         geom_point()+
  labs(title='Time with the highest booking cancellations')

#What are the top most common reasons customers cancel rides?
cancel_reasons <- uber %>%
  group_by (Reason.for.cancelling.by.Customer)%>%
  summarise(count=n(),.groups='drop')%>%
  arrange(desc(count))

# print results
print(cancel_reasons)
## # A tibble: 6 × 2
##   Reason.for.cancelling.by.Customer             count
##   <chr>                                         <int>
## 1 null                                         139500
## 2 Wrong Address                                  2362
## 3 Change of plans                                2353
## 4 Driver is not moving towards pickup location   2335
## 5 Driver asked to cancel                         2295
## 6 AC is not working                              1155