setwd("c:/Users/Vinamra Jain/Downloads/Case18_UberEats_Dynamic_Pricing")
library(tidyverse) #Importing Library
#We first tried this approach but here we found out about the timestamp and datetime and how they don't match cause temperature is taken hourly.
'orders <- read_csv("order_transaction_log.csv")
weather <- read_csv("external_conditions.csv")
weather <- weather %>%
rename(timestamp = datetime)
master_table <- orders %>%
left_join(weather,by = c("city","timestamp"))
head(master_table)
mismatches <- master_table %>%
filter(rainfall_mm.x != rainfall_mm.y)
nrow(mismatches)
master_table <- master_table %>%
select(-rainfall_mm.y) %>%
rename( rainfall = rainfall_mm.x)
restaurants <- read_csv("restaurant_master.csv")
master_table <- master_table %>%
left_join(restaurants,by = "restaurant_id")
glimpse(master_table)'
## [1] "orders <- read_csv(\"order_transaction_log.csv\")\nweather <- read_csv(\"external_conditions.csv\")\n\nweather <- weather %>% \n rename(timestamp = datetime)\n\nmaster_table <- orders %>%\n left_join(weather,by = c(\"city\",\"timestamp\"))\n\nhead(master_table)\n\nmismatches <- master_table %>%\n filter(rainfall_mm.x != rainfall_mm.y)\n\nnrow(mismatches)\n\nmaster_table <- master_table %>%\n select(-rainfall_mm.y) %>%\n rename( rainfall = rainfall_mm.x)\n\nrestaurants <- read_csv(\"restaurant_master.csv\")\n\nmaster_table <- master_table %>%\n left_join(restaurants,by = \"restaurant_id\")\n\nglimpse(master_table)"
#Loading the 3 CSVs
orders <- read_csv("order_transaction_log.csv")
## Rows: 25000 Columns: 11
## ── Column specification ─────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): order_id, customer_id, restaurant_id, device_type, city
## dbl (5): delivery_fee_total, total_order_amount, rainfall_mm, base_delivery_fee, surge_compo...
## dttm (1): timestamp
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
weather <- read_csv("external_conditions.csv")
## Rows: 30744 Columns: 5
## ── Column specification ─────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): city, day_type
## dbl (2): rainfall_mm, temperature_c
## dttm (1): datetime
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
restaurants <- read_csv("restaurant_master.csv")
## Rows: 400 Columns: 8
## ── Column specification ─────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): restaurant_id, cuisine_type, price_tier, chain_name, postcode_prefix, restaurant_name
## dbl (1): rating
## lgl (1): chain_flag
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Renaming datetime to timestamp to join
weather <- weather %>%
rename(timestamp = datetime)
#Rounding off so that we don't get NAs unnecessarily as Temperature is taken hourly and data points are same, also negligible data loss
orders <- orders %>%
mutate(join_time = floor_date(timestamp,"hour"))
weather <- weather %>%
mutate(join_time = floor_date(timestamp,"hour"))
#Combining the CSVs through unique keys
master_table <- orders %>%
left_join(weather, by = c("city","join_time")) %>%
left_join(restaurants, by = "restaurant_id")
glimpse(master_table)
## Rows: 25,000
## Columns: 23
## $ order_id [3m[38;5;246m<chr>[39m[23m "UEO00000001"[38;5;246m, [39m"UEO00000002"[38;5;246m, [39m"UEO00000003"[38;5;246m, [39m"UEO00000004"[38;5;246m, [39m"UEO0000…
## $ customer_id [3m[38;5;246m<chr>[39m[23m "USR008907"[38;5;246m, [39m"USR000848"[38;5;246m, [39m"USR008870"[38;5;246m, [39m"USR002340"[38;5;246m, [39m"USR004713"[38;5;246m, [39m"US…
## $ restaurant_id [3m[38;5;246m<chr>[39m[23m "UE00277"[38;5;246m, [39m"UE00386"[38;5;246m, [39m"UE00212"[38;5;246m, [39m"UE00187"[38;5;246m, [39m"UE00356"[38;5;246m, [39m"UE00020"[38;5;246m, [39m"U…
## $ delivery_fee_total [3m[38;5;246m<dbl>[39m[23m 4.17[38;5;246m, [39m1.14[38;5;246m, [39m2.64[38;5;246m, [39m3.74[38;5;246m, [39m1.00[38;5;246m, [39m2.45[38;5;246m, [39m2.92[38;5;246m, [39m2.17[38;5;246m, [39m4.10[38;5;246m, [39m1.29[38;5;246m, [39m3.52[38;5;246m, [39m6.…
## $ total_order_amount [3m[38;5;246m<dbl>[39m[23m 29.34[38;5;246m, [39m54.49[38;5;246m, [39m35.58[38;5;246m, [39m34.24[38;5;246m, [39m27.44[38;5;246m, [39m21.91[38;5;246m, [39m18.80[38;5;246m, [39m37.22[38;5;246m, [39m34.13[38;5;246m, [39m56.13…
## $ timestamp.x [3m[38;5;246m<dttm>[39m[23m 2023-04-12 13:04:00[38;5;246m, [39m2023-08-10 23:56:00[38;5;246m, [39m2023-09-17 11:19:00[38;5;246m, [39m2023…
## $ device_type [3m[38;5;246m<chr>[39m[23m "iOS"[38;5;246m, [39m"iOS"[38;5;246m, [39m"Android"[38;5;246m, [39m"Android"[38;5;246m, [39m"iOS"[38;5;246m, [39m"iOS"[38;5;246m, [39m"Web"[38;5;246m, [39m"Android"[38;5;246m, [39m…
## $ city [3m[38;5;246m<chr>[39m[23m "Edinburgh"[38;5;246m, [39m"Edinburgh"[38;5;246m, [39m"Leeds"[38;5;246m, [39m"Leeds"[38;5;246m, [39m"London"[38;5;246m, [39m"Leeds"[38;5;246m, [39m"Edin…
## $ rainfall_mm.x [3m[38;5;246m<dbl>[39m[23m 0.03[38;5;246m, [39m0.18[38;5;246m, [39m0.01[38;5;246m, [39m0.42[38;5;246m, [39m0.02[38;5;246m, [39m0.04[38;5;246m, [39m0.11[38;5;246m, [39m0.10[38;5;246m, [39m0.03[38;5;246m, [39m0.02[38;5;246m, [39m0.35[38;5;246m, [39m0.…
## $ base_delivery_fee [3m[38;5;246m<dbl>[39m[23m 2.49[38;5;246m, [39m0.99[38;5;246m, [39m1.99[38;5;246m, [39m2.49[38;5;246m, [39m0.99[38;5;246m, [39m1.49[38;5;246m, [39m1.99[38;5;246m, [39m1.49[38;5;246m, [39m2.49[38;5;246m, [39m0.99[38;5;246m, [39m2.49[38;5;246m, [39m2.…
## $ surge_component [3m[38;5;246m<dbl>[39m[23m 1.68[38;5;246m, [39m0.15[38;5;246m, [39m0.65[38;5;246m, [39m1.25[38;5;246m, [39m0.01[38;5;246m, [39m0.96[38;5;246m, [39m0.93[38;5;246m, [39m0.68[38;5;246m, [39m1.61[38;5;246m, [39m0.30[38;5;246m, [39m1.03[38;5;246m, [39m3.…
## $ join_time [3m[38;5;246m<dttm>[39m[23m 2023-04-12 13:00:00[38;5;246m, [39m2023-08-10 23:00:00[38;5;246m, [39m2023-09-17 11:00:00[38;5;246m, [39m2023…
## $ timestamp.y [3m[38;5;246m<dttm>[39m[23m 2023-04-12 13:00:00[38;5;246m, [39m2023-08-10 23:00:00[38;5;246m, [39m2023-09-17 11:00:00[38;5;246m, [39m2023…
## $ rainfall_mm.y [3m[38;5;246m<dbl>[39m[23m 0.03[38;5;246m, [39m0.18[38;5;246m, [39m0.01[38;5;246m, [39m0.42[38;5;246m, [39m0.02[38;5;246m, [39m0.04[38;5;246m, [39m0.11[38;5;246m, [39m0.10[38;5;246m, [39m0.03[38;5;246m, [39m0.02[38;5;246m, [39m0.35[38;5;246m, [39m0.…
## $ temperature_c [3m[38;5;246m<dbl>[39m[23m 18.1[38;5;246m, [39m15.9[38;5;246m, [39m11.6[38;5;246m, [39m16.2[38;5;246m, [39m7.3[38;5;246m, [39m11.0[38;5;246m, [39m19.9[38;5;246m, [39m16.4[38;5;246m, [39m6.0[38;5;246m, [39m22.0[38;5;246m, [39m13.2[38;5;246m, [39m18.1…
## $ day_type [3m[38;5;246m<chr>[39m[23m "weekday"[38;5;246m, [39m"weekday"[38;5;246m, [39m"weekend"[38;5;246m, [39m"weekend"[38;5;246m, [39m"weekday"[38;5;246m, [39m"weekday"[38;5;246m, [39m"w…
## $ cuisine_type [3m[38;5;246m<chr>[39m[23m "Mexican"[38;5;246m, [39m"Pizza"[38;5;246m, [39m"Burgers"[38;5;246m, [39m"Burgers"[38;5;246m, [39m"Chinese"[38;5;246m, [39m"Burgers"[38;5;246m, [39m"Chi…
## $ price_tier [3m[38;5;246m<chr>[39m[23m "Mid-Range"[38;5;246m, [39m"Mid-Range"[38;5;246m, [39m"Mid-Range"[38;5;246m, [39m"Budget"[38;5;246m, [39m"Premium"[38;5;246m, [39m"Mid-Ran…
## $ rating [3m[38;5;246m<dbl>[39m[23m 4.0[38;5;246m, [39m2.9[38;5;246m, [39m3.1[38;5;246m, [39m4.9[38;5;246m, [39m4.4[38;5;246m, [39m4.8[38;5;246m, [39m4.0[38;5;246m, [39m4.8[38;5;246m, [39m2.6[38;5;246m, [39m3.3[38;5;246m, [39m2.6[38;5;246m, [39m5.0[38;5;246m, [39m3.4[38;5;246m, [39m3.7…
## $ chain_flag [3m[38;5;246m<lgl>[39m[23m FALSE[38;5;246m, [39mTRUE[38;5;246m, [39mTRUE[38;5;246m, [39mTRUE[38;5;246m, [39mFALSE[38;5;246m, [39mFALSE[38;5;246m, [39mFALSE[38;5;246m, [39mTRUE[38;5;246m, [39mTRUE[38;5;246m, [39mFALSE[38;5;246m, [39mTRU…
## $ chain_name [3m[38;5;246m<chr>[39m[23m [31mNA[39m[38;5;246m, [39m"Chain_3"[38;5;246m, [39m[31mNA[39m[38;5;246m, [39m[31mNA[39m[38;5;246m, [39m"Chain_23"[38;5;246m, [39m[31mNA[39m[38;5;246m, [39m"Chain_11"[38;5;246m, [39m[31mNA[39m[38;5;246m, [39m[31mNA[39m[38;5;246m, [39m[31mNA[39m[38;5;246m, [39m[31mNA[39m[38;5;246m, [39m"…
## $ postcode_prefix [3m[38;5;246m<chr>[39m[23m "LM2"[38;5;246m, [39m"FZ1"[38;5;246m, [39m"EL6"[38;5;246m, [39m"QJ7"[38;5;246m, [39m"QI2"[38;5;246m, [39m"KL6"[38;5;246m, [39m"XL2"[38;5;246m, [39m"RM1"[38;5;246m, [39m"JF5"[38;5;246m, [39m"RD5"…
## $ restaurant_name [3m[38;5;246m<chr>[39m[23m "Local Restaurant UE00277"[38;5;246m, [39m"Chain_3 36"[38;5;246m, [39m"Local Restaurant UE00212"…
#Dropping the duplicates and renaming as when duplicates are made they are saved as .x and .y
master_table <- master_table %>%
select(-timestamp.y,-rainfall_mm.y,-join_time) %>%
rename(rainfall_mm = rainfall_mm.x , timestamp = timestamp.x)
#We want to make a graph to check hourly demand for that we are making this new table, grouping by hour of day and summarizing by total order
hourly_demand <- master_table %>%
mutate(hour_of_day = hour(timestamp)) %>%
group_by(hour_of_day) %>%
summarise(total_orders = n() )
glimpse(hourly_demand)
## Rows: 14
## Columns: 2
## $ hour_of_day [3m[38;5;246m<int>[39m[23m 10[38;5;246m, [39m11[38;5;246m, [39m12[38;5;246m, [39m13[38;5;246m, [39m14[38;5;246m, [39m15[38;5;246m, [39m16[38;5;246m, [39m17[38;5;246m, [39m18[38;5;246m, [39m19[38;5;246m, [39m20[38;5;246m, [39m21[38;5;246m, [39m22[38;5;246m, [39m23
## $ total_orders [3m[38;5;246m<int>[39m[23m 1792[38;5;246m, [39m1789[38;5;246m, [39m1814[38;5;246m, [39m1770[38;5;246m, [39m1793[38;5;246m, [39m1763[38;5;246m, [39m1814[38;5;246m, [39m1847[38;5;246m, [39m1807[38;5;246m, [39m1836[38;5;246m, [39m1756[38;5;246m, [39m1715[38;5;246m, [39m17…
#this is the line graph we plotted why line graph because they are standard for time series
ggplot(hourly_demand, aes(x=hour_of_day, y=total_orders))+
geom_line(color = "steelblue", linewidth= 1.2) +
geom_point(color = "navy", size = 3) +
labs(title = "Baseline Demand : Orders by Hour of the Day",
subtitle = "Identifying peak ordering time across all conditions",
x = "Hour of the Day (0 = Midnight , 23 = 11 PM)",
y = "Total Volume of Orders")+
theme_minimal() +
scale_x_continuous(breaks = 0:23)

#Now we want to check average order demand on hourly basis in clear weather and rainy weather to check teh affect of weather
weather_summary <- master_table %>%
mutate(hour_block = floor_date(timestamp,"hour"),
weather_status = ifelse(rainfall_mm > 0 ,"Rainy","Clear")) %>%
group_by(hour_block,weather_status) %>%
summarise(order_in_hour = n(), groups = "drop") %>%
group_by(weather_status) %>%
summarise(avg_orders_per_hour = mean(order_in_hour))
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by hour_block and weather_status.
## ℹ Output is grouped by hour_block.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(hour_block, weather_status))` for per-operation grouping
## (`?dplyr::dplyr_by`) instead.
#We used Bar Graph plot to show the difference between the hourly average order in clear weather and rainy weather
ggplot(weather_summary, aes(x = weather_status, y = avg_orders_per_hour , fill = weather_status)) +
geom_col(width = 0.5) +
labs(title = "The Weather Effect : Average Hourly Demand",
subtitle = "Comparing average order volume on Clear vs Rainy Days",
x = "Weather Condition",
y = "Average Orders per Hour")+
theme_minimal() +
scale_fill_manual(values = c("Clear" = "orange", "Rainy" = "blue"))+
theme(legend.position = "none")

#We now want to see till what surge price are people willing to pay the money
elasticity_data <- master_table %>%
mutate(rounded_surge = round(surge_component,1)) %>%
group_by(rounded_surge) %>%
summarise(total_orders = n(), .groups = "drop") %>%
filter(rounded_surge > 0 & rounded_surge <= 3.0)
#We ploted line graph as it's continuous data
ggplot(elasticity_data , aes(x = rounded_surge , y = total_orders)) +
geom_point(color = "purple",size = 4) +
geom_line(color = "purple",linetype = "dashed" , linewidth = 1) +
labs(title = "The Elasticity Curve : Demand Drop-off",
subtitle = "Tracking total completed orders against applied surge multipliers",
x = "Surge Component",
y = "Total Completed Orders")+
theme_minimal()

#Let's add rest of the CSVs
delivery <- read_csv("delivery_context.csv")
## Rows: 25000 Columns: 4
## ── Column specification ─────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): order_id, order_type
## dbl (2): estimated_distance_km, estimated_delivery_minutes
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rules <- read_csv("platform_pricing_rules.csv")
## Rows: 8 Columns: 7
## ── Column specification ─────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): fee_tier, distance_band_km, surge_trigger, notes
## dbl (2): base_fee_gbp, surge_per_unit_gbp
## date (1): effective_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Exploring data
print(rules)
## # A tibble: 8 × 7
## effective_date fee_tier distance_band_km base_fee_gbp surge_trigger surge_per_unit_gbp notes
## <date> <chr> <chr> <dbl> <chr> <dbl> <chr>
## 1 2023-01-01 tier_1 0-2.0 0.99 driver_availabi… 0.5 Base…
## 2 2023-01-01 tier_2 2.0-5.0 1.49 driver_availabi… 0.5 Mid …
## 3 2023-01-01 tier_3 5.0-8.0 1.99 driver_availabi… 0.5 Long…
## 4 2023-01-01 tier_4 8.0+ 2.49 driver_availabi… 0.5 Very…
## 5 2023-06-01 tier_1 0-2.0 0.99 driver_availabi… 0.6 Upda…
## 6 2023-06-01 tier_2 2.0-5.0 1.49 driver_availabi… 0.6 Upda…
## 7 2023-06-01 tier_3 5.0-8.0 1.99 driver_availabi… 0.6 Upda…
## 8 2023-06-01 tier_4 8.0+ 2.49 driver_availabi… 0.6 Upda…
#Exploring data
print(delivery)
## # A tibble: 25,000 × 4
## order_id estimated_distance_km estimated_delivery_minutes order_type
## <chr> <dbl> <dbl> <chr>
## 1 UEO00000001 5.97 16 Standard
## 2 UEO00000002 NA 43 Standard
## 3 UEO00000003 7.47 55 Standard
## 4 UEO00000004 3.78 55 Standard
## 5 UEO00000005 0.39 26 Standard
## 6 UEO00000006 6.18 25 Standard
## 7 UEO00000007 2.85 23 Standard
## 8 UEO00000008 5.38 48 Standard
## 9 UEO00000009 2.63 42 Standard
## 10 UEO00000010 NA 18 Standard
## # ℹ 24,990 more rows
#To check type wise data if any NA is present or not
delivery %>%
count(order_type)
## # A tibble: 3 × 2
## order_type n
## <chr> <int>
## 1 Near Me 4454
## 2 Scheduled 3003
## 3 Standard 17543
#Added to combined Master Table
master_table <- master_table %>%
left_join(delivery, by = "order_id")
#Tried this but was not correct
'master_table <- master_table %>%
mutate(
rule_distance_band = case_when(
estimated_distance_km <= 2.0 ~ "0-2.0",
estimated_distance_km <= 5.0 ~ "2.0-5.0",
estimated_distance_km <= 8.0 ~ "5.0-8.0",
TRUE ~ "8.0+"
),
rule_active_date = as.Date(ifelse(timestamp < as.Date("2023-06-01"),
"2023-01-01",
"2023-06-01"))
)
master_table <- master_table %>%
left_join(rules, by = c("rule_distance_band" = "distance_band_km",
"rule_active_date" = "effective_date")) %>%
select(-rule_distance_band, -rule_active_date)'
## [1] "master_table <- master_table %>%\n mutate(\n rule_distance_band = case_when(\n estimated_distance_km <= 2.0 ~ \"0-2.0\",\n estimated_distance_km <= 5.0 ~ \"2.0-5.0\",\n estimated_distance_km <= 8.0 ~ \"5.0-8.0\",\n TRUE ~ \"8.0+\" \n ),\n \n rule_active_date = as.Date(ifelse(timestamp < as.Date(\"2023-06-01\"), \n \"2023-01-01\", \n \"2023-06-01\"))\n )\n\nmaster_table <- master_table %>%\n left_join(rules, by = c(\"rule_distance_band\" = \"distance_band_km\", \n \"rule_active_date\" = \"effective_date\")) %>%\n select(-rule_distance_band, -rule_active_date)"
glimpse(master_table)
## Rows: 25,000
## Columns: 23
## $ order_id <chr> "UEO00000001", "UEO00000002", "UEO00000003", "UEO00000004", …
## $ customer_id <chr> "USR008907", "USR000848", "USR008870", "USR002340", "USR0047…
## $ restaurant_id <chr> "UE00277", "UE00386", "UE00212", "UE00187", "UE00356", "UE00…
## $ delivery_fee_total <dbl> 4.17, 1.14, 2.64, 3.74, 1.00, 2.45, 2.92, 2.17, 4.10, 1.29, …
## $ total_order_amount <dbl> 29.34, 54.49, 35.58, 34.24, 27.44, 21.91, 18.80, 37.22, 34.1…
## $ timestamp <dttm> 2023-04-12 13:04:00, 2023-08-10 23:56:00, 2023-09-17 11:19:…
## $ device_type <chr> "iOS", "iOS", "Android", "Android", "iOS", "iOS", "Web", "An…
## $ city <chr> "Edinburgh", "Edinburgh", "Leeds", "Leeds", "London", "Leeds…
## $ rainfall_mm <dbl> 0.03, 0.18, 0.01, 0.42, 0.02, 0.04, 0.11, 0.10, 0.03, 0.02, …
## $ base_delivery_fee <dbl> 2.49, 0.99, 1.99, 2.49, 0.99, 1.49, 1.99, 1.49, 2.49, 0.99, …
## $ surge_component <dbl> 1.68, 0.15, 0.65, 1.25, 0.01, 0.96, 0.93, 0.68, 1.61, 0.30, …
## $ temperature_c <dbl> 18.1, 15.9, 11.6, 16.2, 7.3, 11.0, 19.9, 16.4, 6.0, 22.0, 13…
## $ day_type <chr> "weekday", "weekday", "weekend", "weekend", "weekday", "week…
## $ cuisine_type <chr> "Mexican", "Pizza", "Burgers", "Burgers", "Chinese", "Burger…
## $ price_tier <chr> "Mid-Range", "Mid-Range", "Mid-Range", "Budget", "Premium", …
## $ rating <dbl> 4.0, 2.9, 3.1, 4.9, 4.4, 4.8, 4.0, 4.8, 2.6, 3.3, 2.6, 5.0, …
## $ chain_flag <lgl> FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, FA…
## $ chain_name <chr> NA, "Chain_3", NA, NA, "Chain_23", NA, "Chain_11", NA, NA, N…
## $ postcode_prefix <chr> "LM2", "FZ1", "EL6", "QJ7", "QI2", "KL6", "XL2", "RM1", "JF5…
## $ restaurant_name <chr> "Local Restaurant UE00277", "Chain_3 36", "Local Restaurant …
## $ estimated_distance_km <dbl> 5.97, NA, 7.47, 3.78, 0.39, 6.18, 2.85, 5.38, 2.63, NA, 4.11…
## $ estimated_delivery_minutes <dbl> 16, 43, 55, 55, 26, 25, 23, 48, 42, 18, 22, 49, 31, 17, 57, …
## $ order_type <chr> "Standard", "Standard", "Standard", "Standard", "Standard", …
#checking out the number of NA values in estimated_distance_km
master_table %>%
filter(is.na(estimated_distance_km)) %>%
count()
## # A tibble: 1 × 1
## n
## <int>
## 1 2874
#Adding median of estimated_distance_km order type wise value so that we can fill the NA values appropriately
master_table <- master_table %>%
group_by(order_type) %>%
mutate(
estimated_distance_km = ifelse(is.na(estimated_distance_km),
median(estimated_distance_km, na.rm = TRUE),
estimated_distance_km)
) %>%
ungroup()
#Converting character data type to dbl for calculation
master_table <- master_table %>%
mutate(
rule_distance_band = case_when(
estimated_distance_km <= 2.0 ~ "0-2.0",
estimated_distance_km <= 5.0 ~ "2.0-5.0",
estimated_distance_km <= 8.0 ~ "5.0-8.0",
TRUE ~ "8.0+"
),
rule_active_date = as.Date(ifelse(timestamp < as.Date("2023-06-01"),
"2023-01-01",
"2023-06-01"))
)
#renaming the new made columns
master_table <- master_table %>%
left_join(rules, by = c("rule_distance_band" = "distance_band_km",
"rule_active_date" = "effective_date")) %>%
select(-rule_distance_band, -rule_active_date)
#Checking number of NA values after filling
master_table %>% filter(is.na(estimated_distance_km)) %>% count()
## # A tibble: 1 × 1
## n
## <int>
## 1 0
glimpse(master_table)
## Rows: 25,000
## Columns: 28
## $ order_id <chr> "UEO00000001", "UEO00000002", "UEO00000003", "UEO00000004", …
## $ customer_id <chr> "USR008907", "USR000848", "USR008870", "USR002340", "USR0047…
## $ restaurant_id <chr> "UE00277", "UE00386", "UE00212", "UE00187", "UE00356", "UE00…
## $ delivery_fee_total <dbl> 4.17, 1.14, 2.64, 3.74, 1.00, 2.45, 2.92, 2.17, 4.10, 1.29, …
## $ total_order_amount <dbl> 29.34, 54.49, 35.58, 34.24, 27.44, 21.91, 18.80, 37.22, 34.1…
## $ timestamp <dttm> 2023-04-12 13:04:00, 2023-08-10 23:56:00, 2023-09-17 11:19:…
## $ device_type <chr> "iOS", "iOS", "Android", "Android", "iOS", "iOS", "Web", "An…
## $ city <chr> "Edinburgh", "Edinburgh", "Leeds", "Leeds", "London", "Leeds…
## $ rainfall_mm <dbl> 0.03, 0.18, 0.01, 0.42, 0.02, 0.04, 0.11, 0.10, 0.03, 0.02, …
## $ base_delivery_fee <dbl> 2.49, 0.99, 1.99, 2.49, 0.99, 1.49, 1.99, 1.49, 2.49, 0.99, …
## $ surge_component <dbl> 1.68, 0.15, 0.65, 1.25, 0.01, 0.96, 0.93, 0.68, 1.61, 0.30, …
## $ temperature_c <dbl> 18.1, 15.9, 11.6, 16.2, 7.3, 11.0, 19.9, 16.4, 6.0, 22.0, 13…
## $ day_type <chr> "weekday", "weekday", "weekend", "weekend", "weekday", "week…
## $ cuisine_type <chr> "Mexican", "Pizza", "Burgers", "Burgers", "Chinese", "Burger…
## $ price_tier <chr> "Mid-Range", "Mid-Range", "Mid-Range", "Budget", "Premium", …
## $ rating <dbl> 4.0, 2.9, 3.1, 4.9, 4.4, 4.8, 4.0, 4.8, 2.6, 3.3, 2.6, 5.0, …
## $ chain_flag <lgl> FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, FA…
## $ chain_name <chr> NA, "Chain_3", NA, NA, "Chain_23", NA, "Chain_11", NA, NA, N…
## $ postcode_prefix <chr> "LM2", "FZ1", "EL6", "QJ7", "QI2", "KL6", "XL2", "RM1", "JF5…
## $ restaurant_name <chr> "Local Restaurant UE00277", "Chain_3 36", "Local Restaurant …
## $ estimated_distance_km <dbl> 5.97, 4.14, 7.47, 3.78, 0.39, 6.18, 2.85, 5.38, 2.63, 4.14, …
## $ estimated_delivery_minutes <dbl> 16, 43, 55, 55, 26, 25, 23, 48, 42, 18, 22, 49, 31, 17, 57, …
## $ order_type <chr> "Standard", "Standard", "Standard", "Standard", "Standard", …
## $ fee_tier <chr> "tier_3", "tier_2", "tier_3", "tier_2", "tier_1", "tier_3", …
## $ base_fee_gbp <dbl> 1.99, 1.49, 1.99, 1.49, 0.99, 1.99, 1.49, 1.99, 1.49, 1.49, …
## $ surge_trigger <chr> "driver_availability<5", "driver_availability<4", "driver_av…
## $ surge_per_unit_gbp <dbl> 0.5, 0.6, 0.6, 0.5, 0.5, 0.5, 0.6, 0.5, 0.5, 0.6, 0.5, 0.6, …
## $ notes <chr> "Long distance", "Updated surge threshold Jun 2023", "Update…
#dropping any remaing .y files and adding order_hour
master_table <- master_table %>%
select(-ends_with(".y")) %>%
mutate(
order_hour = hour(timestamp)
)
glimpse(master_table)
## Rows: 25,000
## Columns: 29
## $ order_id <chr> "UEO00000001", "UEO00000002", "UEO00000003", "UEO00000004", …
## $ customer_id <chr> "USR008907", "USR000848", "USR008870", "USR002340", "USR0047…
## $ restaurant_id <chr> "UE00277", "UE00386", "UE00212", "UE00187", "UE00356", "UE00…
## $ delivery_fee_total <dbl> 4.17, 1.14, 2.64, 3.74, 1.00, 2.45, 2.92, 2.17, 4.10, 1.29, …
## $ total_order_amount <dbl> 29.34, 54.49, 35.58, 34.24, 27.44, 21.91, 18.80, 37.22, 34.1…
## $ timestamp <dttm> 2023-04-12 13:04:00, 2023-08-10 23:56:00, 2023-09-17 11:19:…
## $ device_type <chr> "iOS", "iOS", "Android", "Android", "iOS", "iOS", "Web", "An…
## $ city <chr> "Edinburgh", "Edinburgh", "Leeds", "Leeds", "London", "Leeds…
## $ rainfall_mm <dbl> 0.03, 0.18, 0.01, 0.42, 0.02, 0.04, 0.11, 0.10, 0.03, 0.02, …
## $ base_delivery_fee <dbl> 2.49, 0.99, 1.99, 2.49, 0.99, 1.49, 1.99, 1.49, 2.49, 0.99, …
## $ surge_component <dbl> 1.68, 0.15, 0.65, 1.25, 0.01, 0.96, 0.93, 0.68, 1.61, 0.30, …
## $ temperature_c <dbl> 18.1, 15.9, 11.6, 16.2, 7.3, 11.0, 19.9, 16.4, 6.0, 22.0, 13…
## $ day_type <chr> "weekday", "weekday", "weekend", "weekend", "weekday", "week…
## $ cuisine_type <chr> "Mexican", "Pizza", "Burgers", "Burgers", "Chinese", "Burger…
## $ price_tier <chr> "Mid-Range", "Mid-Range", "Mid-Range", "Budget", "Premium", …
## $ rating <dbl> 4.0, 2.9, 3.1, 4.9, 4.4, 4.8, 4.0, 4.8, 2.6, 3.3, 2.6, 5.0, …
## $ chain_flag <lgl> FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, FA…
## $ chain_name <chr> NA, "Chain_3", NA, NA, "Chain_23", NA, "Chain_11", NA, NA, N…
## $ postcode_prefix <chr> "LM2", "FZ1", "EL6", "QJ7", "QI2", "KL6", "XL2", "RM1", "JF5…
## $ restaurant_name <chr> "Local Restaurant UE00277", "Chain_3 36", "Local Restaurant …
## $ estimated_distance_km <dbl> 5.97, 4.14, 7.47, 3.78, 0.39, 6.18, 2.85, 5.38, 2.63, 4.14, …
## $ estimated_delivery_minutes <dbl> 16, 43, 55, 55, 26, 25, 23, 48, 42, 18, 22, 49, 31, 17, 57, …
## $ order_type <chr> "Standard", "Standard", "Standard", "Standard", "Standard", …
## $ fee_tier <chr> "tier_3", "tier_2", "tier_3", "tier_2", "tier_1", "tier_3", …
## $ base_fee_gbp <dbl> 1.99, 1.49, 1.99, 1.49, 0.99, 1.99, 1.49, 1.99, 1.49, 1.49, …
## $ surge_trigger <chr> "driver_availability<5", "driver_availability<4", "driver_av…
## $ surge_per_unit_gbp <dbl> 0.5, 0.6, 0.6, 0.5, 0.5, 0.5, 0.6, 0.5, 0.5, 0.6, 0.5, 0.6, …
## $ notes <chr> "Long distance", "Updated surge threshold Jun 2023", "Update…
## $ order_hour <int> 13, 23, 11, 12, 14, 14, 11, 23, 10, 14, 21, 21, 20, 19, 16, …
#Taking Data Points for correlation matrix
numeric_vars <- master_table %>%
select(
surge_component,
estimated_distance_km,
base_fee_gbp,
temperature_c,
rainfall_mm,
order_hour
)
cor_matrix <- cor(numeric_vars)
#plotting correlation matrix for feature selection in our ML models
cor_data <- as.data.frame(cor_matrix) %>%
mutate(Var1 = rownames(.)) %>%
pivot_longer(cols = -Var1, names_to = "Var2", values_to = "value")
ggplot(cor_data, aes(Var1, Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "#2C7BB6", high = "#D7191C", mid = "white",
midpoint = 0, limit = c(-1,1)) +
geom_text(aes(label = round(value, 2)), color = "black", size = 4) +
labs(title = "Feature Correlation Matrix",
subtitle = "Justifying our predictive features for the ML model",
x = "", y = "", fill = "Correlation") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))

#Checking Surge component to see how many outliers are there and check if there is a need of ml model or not, we used box plot as easy to check outliers
ggplot(master_table, aes(x = order_type, y = surge_component, fill = order_type)) +
geom_boxplot(alpha = 0.7, outlier.color = "#FF4B2B", outlier.alpha = 0.4) +
scale_fill_manual(values = c("Near Me" = "#06C167", "Scheduled" = "#555555", "Standard" = "#000000")) +
labs(title = "Surge Component Distribution by Order Type",
subtitle = "Visualizing price volatility across service categories",
x = "Service Type",
y = "Surge Component (£)") +
theme_minimal() +
theme(legend.position = "none",
plot.title = element_text(face = "bold", size = 14),
panel.grid.minor = element_blank())

#Making the daypart column
master_table <- master_table %>%
mutate(
order_hour = hour(timestamp),
daypart = case_when(
order_hour >= 7 & order_hour < 11 ~ "Breakfast",
order_hour >= 11 & order_hour < 15 ~ "Lunch",
order_hour >= 17 & order_hour < 21 ~ "Dinner",
TRUE ~ "Off-Peak"
)
)
#checking cusine impact
cuisine_impact <- master_table %>%
group_by(cuisine_type) %>%
summarise(
avg_surge = mean(surge_component, na.rm = TRUE),
max_surge = max(surge_component, na.rm = TRUE),
total_orders = n()
) %>%
arrange(desc(avg_surge))
print(cuisine_impact)
## # A tibble: 10 × 4
## cuisine_type avg_surge max_surge total_orders
## <chr> <dbl> <dbl> <int>
## 1 Burgers 1.21 4.47 2700
## 2 Thai 1.21 4.49 2929
## 3 Desserts 1.19 4.5 2546
## 4 Chinese 1.19 4.47 2411
## 5 Indian 1.19 4.49 2228
## 6 Mexican 1.19 4.38 1857
## 7 Healthy 1.19 4.27 1848
## 8 Sushi 1.19 4.5 3046
## 9 Pizza 1.18 4.3 3064
## 10 Italian 1.18 4.49 2371
#made a dynamic price table that is rule based making 1.5 pound as the hard cap anything above it will not be safe
pricing_rule_table <- master_table %>%
group_by(cuisine_type, daypart) %>%
summarise(
standard_surge = mean(surge_component[rainfall_mm == 0], na.rm = TRUE),
rainy_weather_boost = mean(surge_component[rainfall_mm > 0], na.rm = TRUE) /
mean(surge_component[rainfall_mm == 0], na.rm = TRUE),
is_safe = ifelse(standard_surge * rainy_weather_boost <= 1.5, "YES", "ADJUST LOWER"),
.groups = "drop"
) %>%
#
mutate(across(where(is.numeric), ~round(., 2)))
print("--- DYNAMIC PRICING RULE TABLE (CASE 18) ---")
## [1] "--- DYNAMIC PRICING RULE TABLE (CASE 18) ---"
print(pricing_rule_table)
## # A tibble: 40 × 5
## cuisine_type daypart standard_surge rainy_weather_boost is_safe
## <chr> <chr> <dbl> <dbl> <chr>
## 1 Burgers Breakfast 1.25 0.96 YES
## 2 Burgers Dinner 1.36 0.87 YES
## 3 Burgers Lunch 1.24 0.98 YES
## 4 Burgers Off-Peak 1.4 0.87 YES
## 5 Chinese Breakfast 1.05 1.11 YES
## 6 Chinese Dinner 1.1 1.09 YES
## 7 Chinese Lunch 0.97 1.26 YES
## 8 Chinese Off-Peak 1.06 1.1 YES
## 9 Desserts Breakfast 1.99 0.54 YES
## 10 Desserts Dinner 1.1 1.08 YES
## # ℹ 30 more rows
#Importing libraries for ml modelling
library(randomForest)
library(Metrics)
library(rpart)
library(gbm)
#Data for ML model
ml_data <- master_table %>%
select(
surge_component,
estimated_distance_km,
order_hour,
rainfall_mm,
temperature_c,
cuisine_type,
order_type
)
#Changing categorical data to binary so that we can do calculation, it's like one hot coding
ml_data <- ml_data %>%
mutate(across(where(is.character), as.factor))
#Fixing seed so that result is reproducable
set.seed(42)
#Taking 80% of data as training data and 20% as testing data
train_index <- sample(1:nrow(ml_data), 0.8 * nrow(ml_data))
train_set <- ml_data[train_index, ]
test_set <- ml_data[-train_index, ]
#Trying Different models
cat("Training Linear Model...\n")
## Training Linear Model...
lm_model <- lm(surge_component ~ ., data = train_set)
lm_predictions <- predict(lm_model, test_set)
lm_mae <- mae(test_set$surge_component, lm_predictions)
cat("Training Random Forest Model...\n")
## Training Random Forest Model...
rf_model <- randomForest(surge_component ~ ., data = train_set, ntree = 100, importance = TRUE)
rf_predictions <- predict(rf_model, test_set)
rf_mae <- mae(test_set$surge_component, rf_predictions)
cat("Training Decision Tree...\n")
## Training Decision Tree...
tree_model <- rpart(surge_component ~ ., data = train_set)
tree_predictions <- predict(tree_model, test_set)
tree_mae <- mae(test_set$surge_component, tree_predictions)
cat("Training Gradient Boosting Machine (GBM)...\n")
## Training Gradient Boosting Machine (GBM)...
gbm_model <- gbm(surge_component ~ ., data = train_set, distribution = "gaussian",
n.trees = 100, interaction.depth = 3, verbose = FALSE)
gbm_predictions <- predict(gbm_model, test_set, n.trees = 100)
gbm_mae <- mae(test_set$surge_component, gbm_predictions)
#Mean absolute error of all different types of model
final_results <- data.frame(
Model_Type = c("Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting"),
Algorithm_Family = c("Parametric", "Tree-Based", "Ensemble", "Ensemble"),
Interpretability = c("High", "High", "Medium", "Low"),
MAE_GBP = c(round(lm_mae, 3), round(tree_mae, 3), round(rf_mae, 3), round(gbm_mae, 3))
) %>%
arrange(MAE_GBP)
print("--- MODEL TOURNAMENT RESULTS ---")
## [1] "--- MODEL TOURNAMENT RESULTS ---"
print(final_results)
## Model_Type Algorithm_Family Interpretability MAE_GBP
## 1 Decision Tree Tree-Based High 0.623
## 2 Gradient Boosting Ensemble Low 0.624
## 3 Linear Regression Parametric High 0.628
## 4 Random Forest Ensemble Medium 0.633
#Checking Importance Scores of different features
tree_scores <- as.data.frame(tree_model$variable.importance)
colnames(tree_scores) <- "Importance_Score"
print("--- DECISION TREE: FEATURE IMPORTANCE SCORES ---")
## [1] "--- DECISION TREE: FEATURE IMPORTANCE SCORES ---"
print(tree_scores)
## Importance_Score
## rainfall_mm 290.773632
## temperature_c 1.038477
#checking the logic of decision tree
library(rpart.plot)
rpart.plot(
tree_model,
type = 3,
clip.right.labs = FALSE,
branch = .3,
under = TRUE,
box.palette = "Gn",
main = "Uber Eats Dynamic Pricing Decision Tree"
)
