# load r packages
install.packages("tidyverse")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
install.packages("dplyr")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(dplyr)
install.packages("ggplot2")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.5'
## (as 'lib' is unspecified)
library(ggplot2)
# Establish general directory
getwd()
## [1] "/cloud/project"
# List files in specific directory
list.files ("/cloud/project/data")
## [1] "ncr_ride_bookings.csv"
# Load dataset
uber <- read.csv("/cloud/project/data/ncr_ride_bookings.csv")
head(uber)
## Date Time Booking.ID Booking.Status Customer.ID Vehicle.Type
## 1 2024-03-23 12:29:38 "CNR5884300" No Driver Found "CID1982111" eBike
## 2 2024-11-29 18:01:39 "CNR1326809" Incomplete "CID4604802" Go Sedan
## 3 2024-08-23 08:56:10 "CNR8494506" Completed "CID9202816" Auto
## 4 2024-10-21 17:17:25 "CNR8906825" Completed "CID2610914" Premier Sedan
## 5 2024-09-16 22:08:00 "CNR1950162" Completed "CID9933542" Bike
## 6 2024-02-06 09:44:56 "CNR4096693" Completed "CID4670564" Auto
## Pickup.Location Drop.Location Avg.VTAT Avg.CTAT
## 1 Palam Vihar Jhilmil null null
## 2 Shastri Nagar Gurgaon Sector 56 4.9 14.0
## 3 Khandsa Malviya Nagar 13.4 25.8
## 4 Central Secretariat Inderlok 13.1 28.5
## 5 Ghitorni Village Khan Market 5.3 19.6
## 6 AIIMS Narsinghpur 5.1 18.1
## Cancelled.Rides.by.Customer Reason.for.cancelling.by.Customer
## 1 null null
## 2 null null
## 3 null null
## 4 null null
## 5 null null
## 6 null null
## Cancelled.Rides.by.Driver Driver.Cancellation.Reason Incomplete.Rides
## 1 null null null
## 2 null null 1
## 3 null null null
## 4 null null null
## 5 null null null
## 6 null null null
## Incomplete.Rides.Reason Booking.Value Ride.Distance Driver.Ratings
## 1 null null null null
## 2 Vehicle Breakdown 237 5.73 null
## 3 null 627 13.58 4.9
## 4 null 416 34.02 4.6
## 5 null 737 48.21 4.1
## 6 null 316 4.85 4.1
## Customer.Rating Payment.Method
## 1 null null
## 2 null UPI
## 3 4.9 Debit Card
## 4 5.0 UPI
## 5 4.3 UPI
## 6 4.6 UPI
#Vehicle type with highest average ride distance and booking value?
# Step1: load dplyr package
library (dplyr)
# Step2: convert to numeric
uber <- uber %>%
mutate(
Ride.Distance=as.numeric(Ride.Distance),
Booking.Value=as.numeric(Booking.Value)
)
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Ride.Distance = as.numeric(Ride.Distance)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Step 3: aggregate and group
busiest_vehicle <- uber %>%
filter(!is.na(Ride.Distance),!is.na(Booking.Value))%>%
group_by(Vehicle.Type) %>%
summarise(
Avg_ride_dist=mean(Ride.Distance),
Avg_booking_value=mean(Booking.Value)
,.groups='drop')%>%
arrange(desc(Avg_ride_dist),desc(Avg_booking_value))
# Step 4: print results
print(busiest_vehicle)
## # A tibble: 7 × 3
## Vehicle.Type Avg_ride_dist Avg_booking_value
## <chr> <dbl> <dbl>
## 1 eBike 25.0 504.
## 2 Bike 24.6 510.
## 3 Auto 24.6 507.
## 4 Go Mini 24.6 508.
## 5 Go Sedan 24.6 512.
## 6 Premier Sedan 24.6 510.
## 7 Uber XL 24.4 502.
# Step 5: visualize results
library(ggplot2)
ggplot (data=busiest_vehicle,mapping=aes(x=Vehicle.Type,y=Avg_ride_dist))+
geom_col(color='black',fill='lightblue', position='stack')+
labs(title='Busiest Vehicle Type by Booking and Ride Distance')+
theme(axis.text.x = element_text(angle=45,hjust = 1))

# What is the cancellation rate by customers versus drivers across different pickup locations?
#step1: convert to numeric
uber <- uber %>%
mutate(
Cancelled.Rides.by.Customer=as.numeric(Cancelled.Rides.by.Customer),
Cancelled.Rides.by.Driver=as.numeric(Cancelled.Rides.by.Driver)
)
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Cancelled.Rides.by.Customer =
## as.numeric(Cancelled.Rides.by.Customer)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
#Step2: calculate rate
cancellation_rate <- uber %>%
group_by(Pickup.Location)%>%
summarise(
total_rides=n(),
total_customers=(sum(Cancelled.Rides.by.Customer,na.rm=TRUE)/total_rides)*100,
total_drivers=(sum(Cancelled.Rides.by.Driver,na.rm=TRUE)/total_rides)*100
) %>%
slice(1:10)
# Step 3: print results
print(cancellation_rate)
## # A tibble: 10 × 4
## Pickup.Location total_rides total_customers total_drivers
## <chr> <int> <dbl> <dbl>
## 1 AIIMS 918 6.54 19.2
## 2 Adarsh Nagar 858 7.23 16.4
## 3 Akshardham 839 9.30 20.3
## 4 Ambience Mall 873 6.64 17.3
## 5 Anand Vihar 859 6.64 16.5
## 6 Anand Vihar ISBT 831 8.30 16.8
## 7 Ardee City 861 7.43 16.7
## 8 Arjangarh 810 9.14 16.7
## 9 Ashok Park Main 884 6.56 17.1
## 10 Ashok Vihar 796 6.28 18.0
# How do average VTAT (vehicle turnaround time) & CTAT (customer turnaround
# time) vary by vehicle type?
# Step1: convert to numeric
uber <- uber %>%
mutate(
Avg.VTAT = as.numeric(Avg.VTAT),
Avg.CTAT = as.numeric(Avg.CTAT)
)
## Warning: There were 2 warnings in `mutate()`.
## The first warning was:
## ℹ In argument: `Avg.VTAT = as.numeric(Avg.VTAT)`.
## Caused by warning:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
# Step2: aggregate vehical stats
vehicle_stats <- uber %>%
group_by (Vehicle.Type) %>%
summarise(
Average_vtat = mean(Avg.VTAT,na.rm=TRUE),
Average_ctat = mean(Avg.CTAT,na.rm=TRUE)
)
# Step3: print results
print(vehicle_stats)
## # A tibble: 7 × 3
## Vehicle.Type Average_vtat Average_ctat
## <chr> <dbl> <dbl>
## 1 Auto 8.45 29.1
## 2 Bike 8.50 29.2
## 3 Go Mini 8.47 29.2
## 4 Go Sedan 8.40 29.0
## 5 Premier Sedan 8.44 29.2
## 6 Uber XL 8.58 29.2
## 7 eBike 8.48 29.2
# What are the top three most common reasons customers cancel rides?
# Step1: aggregate cancel reasons
cancelled_rides <- uber %>%
filter(!is.na(Reason.for.cancelling.by.Customer))%>%
group_by (Reason.for.cancelling.by.Customer) %>%
summarise(count=n(),.groups='drop')
# Step2: print results
print(cancelled_rides)
## # A tibble: 6 × 2
## Reason.for.cancelling.by.Customer count
## <chr> <int>
## 1 AC is not working 1155
## 2 Change of plans 2353
## 3 Driver asked to cancel 2295
## 4 Driver is not moving towards pickup location 2335
## 5 Wrong Address 2362
## 6 null 139500
# Step3: visualize results
ggplot(data=cancelled_rides,mapping=aes(x=Reason.for.cancelling.by.Customer,y=count))+
geom_col(color='black',fill='green',)+
labs(title='top three most common reasons customers cancel rides')

# Do incomplete rides occur more frequently in certain pickup locations?
# step1: aggregate and group by pick up location
location_status <- uber %>%
group_by (Pickup.Location)%>%
summarise(count=n(),.groups='drop')%>%
arrange(desc(count))%>%
slice(1:10)
# step2: print results
print(location_status)
## # A tibble: 10 × 2
## Pickup.Location count
## <chr> <int>
## 1 Khandsa 949
## 2 Barakhamba Road 946
## 3 Saket 931
## 4 Badarpur 921
## 5 Pragati Maidan 920
## 6 Madipur 919
## 7 AIIMS 918
## 8 Mehrauli 915
## 9 Dwarka Sector 21 914
## 10 Pataudi Chowk 907
#How does booking value correlate with ride distance across all bookings?
# calculate correlation
corr_result <- cor(uber$Booking.Value,uber$Ride.Distance,use='complete.obs')
# print results
print(corr_result)
## [1] 0.005173977
# Which payment method is used most often by vehicle type?
# Aggregate and group by vehicle type
popular_payment <- uber %>%
group_by (Vehicle.Type)%>%
summarise(count=n(),.groups='drop')%>%
arrange(desc(count))
# print results
print (popular_payment)
## # A tibble: 7 × 2
## Vehicle.Type count
## <chr> <int>
## 1 Auto 37419
## 2 Go Mini 29806
## 3 Go Sedan 27141
## 4 Bike 22517
## 5 Premier Sedan 18111
## 6 eBike 10557
## 7 Uber XL 4449
#visualize results
ggplot(data=popular_payment,mapping=aes(x=Vehicle.Type,y=count))+
geom_col(color='black',fill='orange')+
labs(title='Payment Type by Vehicle Type')

# What time of day sees the highest booking cancellations by drivers?
busiest_time <- uber %>%
group_by(Time) %>%
summarise(cancel_total=sum(Cancelled.Rides.by.Driver,na.rm=TRUE),.groups='drop')%>%
arrange(desc(cancel_total))%>%
slice(1:5)
# print results
print (busiest_time)
## # A tibble: 5 × 2
## Time cancel_total
## <chr> <dbl>
## 1 18:40:56 6
## 2 09:46:34 5
## 3 10:36:44 5
## 4 19:30:47 5
## 5 08:15:05 4
# visualize results
ggplot(data=busiest_time,mapping=aes(x=Time,y=cancel_total,size=cancel_total))+
geom_point()+
labs(title='Time with the highest booking cancellations')

#What are the top most common reasons customers cancel rides?
cancel_reasons <- uber %>%
group_by (Reason.for.cancelling.by.Customer)%>%
summarise(count=n(),.groups='drop')%>%
arrange(desc(count))
# print results
print(cancel_reasons)
## # A tibble: 6 × 2
## Reason.for.cancelling.by.Customer count
## <chr> <int>
## 1 null 139500
## 2 Wrong Address 2362
## 3 Change of plans 2353
## 4 Driver is not moving towards pickup location 2335
## 5 Driver asked to cancel 2295
## 6 AC is not working 1155