setwd("c:/Users/Vinamra Jain/Downloads/Case18_UberEats_Dynamic_Pricing")
library(tidyverse) #Importing Library

#We first tried this approach but here we found out about the timestamp and datetime and how they don't match cause temperature is taken hourly.
'orders <- read_csv("order_transaction_log.csv")
weather <- read_csv("external_conditions.csv")

weather <- weather %>% 
  rename(timestamp = datetime)

master_table <- orders %>%
 left_join(weather,by = c("city","timestamp"))

head(master_table)

mismatches <- master_table %>%
  filter(rainfall_mm.x != rainfall_mm.y)

nrow(mismatches)

master_table <- master_table %>%
  select(-rainfall_mm.y) %>%
  rename( rainfall = rainfall_mm.x)

restaurants <- read_csv("restaurant_master.csv")

master_table <- master_table %>%
  left_join(restaurants,by = "restaurant_id")

glimpse(master_table)'
## [1] "orders <- read_csv(\"order_transaction_log.csv\")\nweather <- read_csv(\"external_conditions.csv\")\n\nweather <- weather %>% \n  rename(timestamp = datetime)\n\nmaster_table <- orders %>%\n left_join(weather,by = c(\"city\",\"timestamp\"))\n\nhead(master_table)\n\nmismatches <- master_table %>%\n  filter(rainfall_mm.x != rainfall_mm.y)\n\nnrow(mismatches)\n\nmaster_table <- master_table %>%\n  select(-rainfall_mm.y) %>%\n  rename( rainfall = rainfall_mm.x)\n\nrestaurants <- read_csv(\"restaurant_master.csv\")\n\nmaster_table <- master_table %>%\n  left_join(restaurants,by = \"restaurant_id\")\n\nglimpse(master_table)"
#Loading the 3 CSVs  
orders <- read_csv("order_transaction_log.csv")
## Rows: 25000 Columns: 11
## ── Column specification ─────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): order_id, customer_id, restaurant_id, device_type, city
## dbl  (5): delivery_fee_total, total_order_amount, rainfall_mm, base_delivery_fee, surge_compo...
## dttm (1): timestamp
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
weather <- read_csv("external_conditions.csv")
## Rows: 30744 Columns: 5
## ── Column specification ─────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): city, day_type
## dbl  (2): rainfall_mm, temperature_c
## dttm (1): datetime
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
restaurants <- read_csv("restaurant_master.csv")
## Rows: 400 Columns: 8
## ── Column specification ─────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): restaurant_id, cuisine_type, price_tier, chain_name, postcode_prefix, restaurant_name
## dbl (1): rating
## lgl (1): chain_flag
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Renaming datetime to timestamp to join
weather <- weather %>%
  rename(timestamp = datetime)

#Rounding off so that we don't get NAs unnecessarily as Temperature is taken hourly and data points are same, also negligible data loss 
orders <- orders %>%
  mutate(join_time = floor_date(timestamp,"hour"))

weather <- weather %>%
  mutate(join_time = floor_date(timestamp,"hour"))

#Combining the CSVs through unique keys
master_table <- orders %>%
  left_join(weather, by = c("city","join_time")) %>%
  
  left_join(restaurants, by = "restaurant_id") 

glimpse(master_table)
## Rows: 25,000
## Columns: 23
## $ order_id           [3m[38;5;246m<chr>[39m[23m "UEO00000001"[38;5;246m, [39m"UEO00000002"[38;5;246m, [39m"UEO00000003"[38;5;246m, [39m"UEO00000004"[38;5;246m, [39m"UEO0000…
## $ customer_id        [3m[38;5;246m<chr>[39m[23m "USR008907"[38;5;246m, [39m"USR000848"[38;5;246m, [39m"USR008870"[38;5;246m, [39m"USR002340"[38;5;246m, [39m"USR004713"[38;5;246m, [39m"US…
## $ restaurant_id      [3m[38;5;246m<chr>[39m[23m "UE00277"[38;5;246m, [39m"UE00386"[38;5;246m, [39m"UE00212"[38;5;246m, [39m"UE00187"[38;5;246m, [39m"UE00356"[38;5;246m, [39m"UE00020"[38;5;246m, [39m"U…
## $ delivery_fee_total [3m[38;5;246m<dbl>[39m[23m 4.17[38;5;246m, [39m1.14[38;5;246m, [39m2.64[38;5;246m, [39m3.74[38;5;246m, [39m1.00[38;5;246m, [39m2.45[38;5;246m, [39m2.92[38;5;246m, [39m2.17[38;5;246m, [39m4.10[38;5;246m, [39m1.29[38;5;246m, [39m3.52[38;5;246m, [39m6.…
## $ total_order_amount [3m[38;5;246m<dbl>[39m[23m 29.34[38;5;246m, [39m54.49[38;5;246m, [39m35.58[38;5;246m, [39m34.24[38;5;246m, [39m27.44[38;5;246m, [39m21.91[38;5;246m, [39m18.80[38;5;246m, [39m37.22[38;5;246m, [39m34.13[38;5;246m, [39m56.13…
## $ timestamp.x        [3m[38;5;246m<dttm>[39m[23m 2023-04-12 13:04:00[38;5;246m, [39m2023-08-10 23:56:00[38;5;246m, [39m2023-09-17 11:19:00[38;5;246m, [39m2023…
## $ device_type        [3m[38;5;246m<chr>[39m[23m "iOS"[38;5;246m, [39m"iOS"[38;5;246m, [39m"Android"[38;5;246m, [39m"Android"[38;5;246m, [39m"iOS"[38;5;246m, [39m"iOS"[38;5;246m, [39m"Web"[38;5;246m, [39m"Android"[38;5;246m, [39m…
## $ city               [3m[38;5;246m<chr>[39m[23m "Edinburgh"[38;5;246m, [39m"Edinburgh"[38;5;246m, [39m"Leeds"[38;5;246m, [39m"Leeds"[38;5;246m, [39m"London"[38;5;246m, [39m"Leeds"[38;5;246m, [39m"Edin…
## $ rainfall_mm.x      [3m[38;5;246m<dbl>[39m[23m 0.03[38;5;246m, [39m0.18[38;5;246m, [39m0.01[38;5;246m, [39m0.42[38;5;246m, [39m0.02[38;5;246m, [39m0.04[38;5;246m, [39m0.11[38;5;246m, [39m0.10[38;5;246m, [39m0.03[38;5;246m, [39m0.02[38;5;246m, [39m0.35[38;5;246m, [39m0.…
## $ base_delivery_fee  [3m[38;5;246m<dbl>[39m[23m 2.49[38;5;246m, [39m0.99[38;5;246m, [39m1.99[38;5;246m, [39m2.49[38;5;246m, [39m0.99[38;5;246m, [39m1.49[38;5;246m, [39m1.99[38;5;246m, [39m1.49[38;5;246m, [39m2.49[38;5;246m, [39m0.99[38;5;246m, [39m2.49[38;5;246m, [39m2.…
## $ surge_component    [3m[38;5;246m<dbl>[39m[23m 1.68[38;5;246m, [39m0.15[38;5;246m, [39m0.65[38;5;246m, [39m1.25[38;5;246m, [39m0.01[38;5;246m, [39m0.96[38;5;246m, [39m0.93[38;5;246m, [39m0.68[38;5;246m, [39m1.61[38;5;246m, [39m0.30[38;5;246m, [39m1.03[38;5;246m, [39m3.…
## $ join_time          [3m[38;5;246m<dttm>[39m[23m 2023-04-12 13:00:00[38;5;246m, [39m2023-08-10 23:00:00[38;5;246m, [39m2023-09-17 11:00:00[38;5;246m, [39m2023…
## $ timestamp.y        [3m[38;5;246m<dttm>[39m[23m 2023-04-12 13:00:00[38;5;246m, [39m2023-08-10 23:00:00[38;5;246m, [39m2023-09-17 11:00:00[38;5;246m, [39m2023…
## $ rainfall_mm.y      [3m[38;5;246m<dbl>[39m[23m 0.03[38;5;246m, [39m0.18[38;5;246m, [39m0.01[38;5;246m, [39m0.42[38;5;246m, [39m0.02[38;5;246m, [39m0.04[38;5;246m, [39m0.11[38;5;246m, [39m0.10[38;5;246m, [39m0.03[38;5;246m, [39m0.02[38;5;246m, [39m0.35[38;5;246m, [39m0.…
## $ temperature_c      [3m[38;5;246m<dbl>[39m[23m 18.1[38;5;246m, [39m15.9[38;5;246m, [39m11.6[38;5;246m, [39m16.2[38;5;246m, [39m7.3[38;5;246m, [39m11.0[38;5;246m, [39m19.9[38;5;246m, [39m16.4[38;5;246m, [39m6.0[38;5;246m, [39m22.0[38;5;246m, [39m13.2[38;5;246m, [39m18.1…
## $ day_type           [3m[38;5;246m<chr>[39m[23m "weekday"[38;5;246m, [39m"weekday"[38;5;246m, [39m"weekend"[38;5;246m, [39m"weekend"[38;5;246m, [39m"weekday"[38;5;246m, [39m"weekday"[38;5;246m, [39m"w…
## $ cuisine_type       [3m[38;5;246m<chr>[39m[23m "Mexican"[38;5;246m, [39m"Pizza"[38;5;246m, [39m"Burgers"[38;5;246m, [39m"Burgers"[38;5;246m, [39m"Chinese"[38;5;246m, [39m"Burgers"[38;5;246m, [39m"Chi…
## $ price_tier         [3m[38;5;246m<chr>[39m[23m "Mid-Range"[38;5;246m, [39m"Mid-Range"[38;5;246m, [39m"Mid-Range"[38;5;246m, [39m"Budget"[38;5;246m, [39m"Premium"[38;5;246m, [39m"Mid-Ran…
## $ rating             [3m[38;5;246m<dbl>[39m[23m 4.0[38;5;246m, [39m2.9[38;5;246m, [39m3.1[38;5;246m, [39m4.9[38;5;246m, [39m4.4[38;5;246m, [39m4.8[38;5;246m, [39m4.0[38;5;246m, [39m4.8[38;5;246m, [39m2.6[38;5;246m, [39m3.3[38;5;246m, [39m2.6[38;5;246m, [39m5.0[38;5;246m, [39m3.4[38;5;246m, [39m3.7…
## $ chain_flag         [3m[38;5;246m<lgl>[39m[23m FALSE[38;5;246m, [39mTRUE[38;5;246m, [39mTRUE[38;5;246m, [39mTRUE[38;5;246m, [39mFALSE[38;5;246m, [39mFALSE[38;5;246m, [39mFALSE[38;5;246m, [39mTRUE[38;5;246m, [39mTRUE[38;5;246m, [39mFALSE[38;5;246m, [39mTRU…
## $ chain_name         [3m[38;5;246m<chr>[39m[23m [31mNA[39m[38;5;246m, [39m"Chain_3"[38;5;246m, [39m[31mNA[39m[38;5;246m, [39m[31mNA[39m[38;5;246m, [39m"Chain_23"[38;5;246m, [39m[31mNA[39m[38;5;246m, [39m"Chain_11"[38;5;246m, [39m[31mNA[39m[38;5;246m, [39m[31mNA[39m[38;5;246m, [39m[31mNA[39m[38;5;246m, [39m[31mNA[39m[38;5;246m, [39m"…
## $ postcode_prefix    [3m[38;5;246m<chr>[39m[23m "LM2"[38;5;246m, [39m"FZ1"[38;5;246m, [39m"EL6"[38;5;246m, [39m"QJ7"[38;5;246m, [39m"QI2"[38;5;246m, [39m"KL6"[38;5;246m, [39m"XL2"[38;5;246m, [39m"RM1"[38;5;246m, [39m"JF5"[38;5;246m, [39m"RD5"…
## $ restaurant_name    [3m[38;5;246m<chr>[39m[23m "Local Restaurant UE00277"[38;5;246m, [39m"Chain_3 36"[38;5;246m, [39m"Local Restaurant UE00212"…
#Dropping the duplicates and renaming as when duplicates are made they are saved as .x and .y
master_table <- master_table %>%
  select(-timestamp.y,-rainfall_mm.y,-join_time) %>%
  rename(rainfall_mm = rainfall_mm.x , timestamp = timestamp.x)

#We want to make a graph to check hourly demand for that we are making this new table, grouping by hour of day and summarizing by total order
hourly_demand <- master_table %>%
  mutate(hour_of_day = hour(timestamp)) %>%
  group_by(hour_of_day) %>%
  summarise(total_orders = n() )
glimpse(hourly_demand)
## Rows: 14
## Columns: 2
## $ hour_of_day  [3m[38;5;246m<int>[39m[23m 10[38;5;246m, [39m11[38;5;246m, [39m12[38;5;246m, [39m13[38;5;246m, [39m14[38;5;246m, [39m15[38;5;246m, [39m16[38;5;246m, [39m17[38;5;246m, [39m18[38;5;246m, [39m19[38;5;246m, [39m20[38;5;246m, [39m21[38;5;246m, [39m22[38;5;246m, [39m23
## $ total_orders [3m[38;5;246m<int>[39m[23m 1792[38;5;246m, [39m1789[38;5;246m, [39m1814[38;5;246m, [39m1770[38;5;246m, [39m1793[38;5;246m, [39m1763[38;5;246m, [39m1814[38;5;246m, [39m1847[38;5;246m, [39m1807[38;5;246m, [39m1836[38;5;246m, [39m1756[38;5;246m, [39m1715[38;5;246m, [39m17…
#this is the line graph we plotted why line graph because they are standard for time series
ggplot(hourly_demand, aes(x=hour_of_day, y=total_orders))+
  geom_line(color = "steelblue", linewidth= 1.2) +
  geom_point(color = "navy", size = 3) +
  labs(title = "Baseline Demand : Orders by Hour of the Day",
       subtitle = "Identifying peak ordering time across all conditions",
       x = "Hour of the Day (0 = Midnight , 23 = 11 PM)",
       y = "Total Volume of Orders")+
  theme_minimal() +
  scale_x_continuous(breaks = 0:23)

#Now we want to check average order demand on hourly basis in clear weather and rainy weather to check teh affect of weather
weather_summary <- master_table %>%
  mutate(hour_block = floor_date(timestamp,"hour"),
         weather_status = ifelse(rainfall_mm > 0 ,"Rainy","Clear")) %>%
  
  group_by(hour_block,weather_status) %>%
  summarise(order_in_hour = n(), groups = "drop") %>%
  
  group_by(weather_status) %>%
  summarise(avg_orders_per_hour = mean(order_in_hour))
## `summarise()` has regrouped the output.
## ℹ Summaries were computed grouped by hour_block and weather_status.
## ℹ Output is grouped by hour_block.
## ℹ Use `summarise(.groups = "drop_last")` to silence this message.
## ℹ Use `summarise(.by = c(hour_block, weather_status))` for per-operation grouping
##   (`?dplyr::dplyr_by`) instead.
#We used Bar Graph plot to show the difference between the hourly average order in clear weather and rainy weather
ggplot(weather_summary, aes(x = weather_status, y = avg_orders_per_hour , fill = weather_status)) +
  
  geom_col(width = 0.5) +
  labs(title = "The Weather Effect : Average Hourly Demand",
       subtitle = "Comparing average order volume on Clear vs Rainy Days",
       x = "Weather Condition",
       y = "Average Orders per Hour")+
  theme_minimal() +
  scale_fill_manual(values = c("Clear" = "orange", "Rainy" = "blue"))+
  theme(legend.position = "none")

#We now want to see till what surge price are people willing to pay the money
elasticity_data <- master_table %>%
  mutate(rounded_surge = round(surge_component,1)) %>% 
  group_by(rounded_surge) %>%
  summarise(total_orders = n(), .groups = "drop") %>%
  filter(rounded_surge > 0 & rounded_surge <= 3.0)

#We ploted line graph as it's continuous data 
ggplot(elasticity_data , aes(x = rounded_surge , y = total_orders)) +
  geom_point(color = "purple",size = 4) +
  geom_line(color = "purple",linetype = "dashed" , linewidth = 1) +
  labs(title = "The Elasticity Curve : Demand Drop-off",
       subtitle = "Tracking total completed orders against applied surge multipliers",
       x = "Surge Component",
       y = "Total Completed Orders")+
  theme_minimal()

#Let's add rest of the CSVs   
delivery <- read_csv("delivery_context.csv")
## Rows: 25000 Columns: 4
## ── Column specification ─────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): order_id, order_type
## dbl (2): estimated_distance_km, estimated_delivery_minutes
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
rules <- read_csv("platform_pricing_rules.csv")
## Rows: 8 Columns: 7
## ── Column specification ─────────────────────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (4): fee_tier, distance_band_km, surge_trigger, notes
## dbl  (2): base_fee_gbp, surge_per_unit_gbp
## date (1): effective_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Exploring data
print(rules)
## # A tibble: 8 × 7
##   effective_date fee_tier distance_band_km base_fee_gbp surge_trigger    surge_per_unit_gbp notes
##   <date>         <chr>    <chr>                   <dbl> <chr>                         <dbl> <chr>
## 1 2023-01-01     tier_1   0-2.0                    0.99 driver_availabi…                0.5 Base…
## 2 2023-01-01     tier_2   2.0-5.0                  1.49 driver_availabi…                0.5 Mid …
## 3 2023-01-01     tier_3   5.0-8.0                  1.99 driver_availabi…                0.5 Long…
## 4 2023-01-01     tier_4   8.0+                     2.49 driver_availabi…                0.5 Very…
## 5 2023-06-01     tier_1   0-2.0                    0.99 driver_availabi…                0.6 Upda…
## 6 2023-06-01     tier_2   2.0-5.0                  1.49 driver_availabi…                0.6 Upda…
## 7 2023-06-01     tier_3   5.0-8.0                  1.99 driver_availabi…                0.6 Upda…
## 8 2023-06-01     tier_4   8.0+                     2.49 driver_availabi…                0.6 Upda…
#Exploring data
print(delivery)
## # A tibble: 25,000 × 4
##    order_id    estimated_distance_km estimated_delivery_minutes order_type
##    <chr>                       <dbl>                      <dbl> <chr>     
##  1 UEO00000001                  5.97                         16 Standard  
##  2 UEO00000002                 NA                            43 Standard  
##  3 UEO00000003                  7.47                         55 Standard  
##  4 UEO00000004                  3.78                         55 Standard  
##  5 UEO00000005                  0.39                         26 Standard  
##  6 UEO00000006                  6.18                         25 Standard  
##  7 UEO00000007                  2.85                         23 Standard  
##  8 UEO00000008                  5.38                         48 Standard  
##  9 UEO00000009                  2.63                         42 Standard  
## 10 UEO00000010                 NA                            18 Standard  
## # ℹ 24,990 more rows
#To check type wise data if any NA is present or not
delivery %>%
  count(order_type)
## # A tibble: 3 × 2
##   order_type     n
##   <chr>      <int>
## 1 Near Me     4454
## 2 Scheduled   3003
## 3 Standard   17543
#Added to combined Master Table
master_table <- master_table %>%
  left_join(delivery, by = "order_id")

#Tried this but was not correct
'master_table <- master_table %>%
  mutate(
    rule_distance_band = case_when(
      estimated_distance_km <= 2.0 ~ "0-2.0",
      estimated_distance_km <= 5.0 ~ "2.0-5.0",
      estimated_distance_km <= 8.0 ~ "5.0-8.0",
      TRUE ~ "8.0+" 
    ),
    
    rule_active_date = as.Date(ifelse(timestamp < as.Date("2023-06-01"), 
                                      "2023-01-01", 
                                      "2023-06-01"))
  )

master_table <- master_table %>%
  left_join(rules, by = c("rule_distance_band" = "distance_band_km", 
                          "rule_active_date" = "effective_date")) %>%
  select(-rule_distance_band, -rule_active_date)'
## [1] "master_table <- master_table %>%\n  mutate(\n    rule_distance_band = case_when(\n      estimated_distance_km <= 2.0 ~ \"0-2.0\",\n      estimated_distance_km <= 5.0 ~ \"2.0-5.0\",\n      estimated_distance_km <= 8.0 ~ \"5.0-8.0\",\n      TRUE ~ \"8.0+\" \n    ),\n    \n    rule_active_date = as.Date(ifelse(timestamp < as.Date(\"2023-06-01\"), \n                                      \"2023-01-01\", \n                                      \"2023-06-01\"))\n  )\n\nmaster_table <- master_table %>%\n  left_join(rules, by = c(\"rule_distance_band\" = \"distance_band_km\", \n                          \"rule_active_date\" = \"effective_date\")) %>%\n  select(-rule_distance_band, -rule_active_date)"
glimpse(master_table)
## Rows: 25,000
## Columns: 23
## $ order_id                   <chr> "UEO00000001", "UEO00000002", "UEO00000003", "UEO00000004", …
## $ customer_id                <chr> "USR008907", "USR000848", "USR008870", "USR002340", "USR0047…
## $ restaurant_id              <chr> "UE00277", "UE00386", "UE00212", "UE00187", "UE00356", "UE00…
## $ delivery_fee_total         <dbl> 4.17, 1.14, 2.64, 3.74, 1.00, 2.45, 2.92, 2.17, 4.10, 1.29, …
## $ total_order_amount         <dbl> 29.34, 54.49, 35.58, 34.24, 27.44, 21.91, 18.80, 37.22, 34.1…
## $ timestamp                  <dttm> 2023-04-12 13:04:00, 2023-08-10 23:56:00, 2023-09-17 11:19:…
## $ device_type                <chr> "iOS", "iOS", "Android", "Android", "iOS", "iOS", "Web", "An…
## $ city                       <chr> "Edinburgh", "Edinburgh", "Leeds", "Leeds", "London", "Leeds…
## $ rainfall_mm                <dbl> 0.03, 0.18, 0.01, 0.42, 0.02, 0.04, 0.11, 0.10, 0.03, 0.02, …
## $ base_delivery_fee          <dbl> 2.49, 0.99, 1.99, 2.49, 0.99, 1.49, 1.99, 1.49, 2.49, 0.99, …
## $ surge_component            <dbl> 1.68, 0.15, 0.65, 1.25, 0.01, 0.96, 0.93, 0.68, 1.61, 0.30, …
## $ temperature_c              <dbl> 18.1, 15.9, 11.6, 16.2, 7.3, 11.0, 19.9, 16.4, 6.0, 22.0, 13…
## $ day_type                   <chr> "weekday", "weekday", "weekend", "weekend", "weekday", "week…
## $ cuisine_type               <chr> "Mexican", "Pizza", "Burgers", "Burgers", "Chinese", "Burger…
## $ price_tier                 <chr> "Mid-Range", "Mid-Range", "Mid-Range", "Budget", "Premium", …
## $ rating                     <dbl> 4.0, 2.9, 3.1, 4.9, 4.4, 4.8, 4.0, 4.8, 2.6, 3.3, 2.6, 5.0, …
## $ chain_flag                 <lgl> FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, FA…
## $ chain_name                 <chr> NA, "Chain_3", NA, NA, "Chain_23", NA, "Chain_11", NA, NA, N…
## $ postcode_prefix            <chr> "LM2", "FZ1", "EL6", "QJ7", "QI2", "KL6", "XL2", "RM1", "JF5…
## $ restaurant_name            <chr> "Local Restaurant UE00277", "Chain_3 36", "Local Restaurant …
## $ estimated_distance_km      <dbl> 5.97, NA, 7.47, 3.78, 0.39, 6.18, 2.85, 5.38, 2.63, NA, 4.11…
## $ estimated_delivery_minutes <dbl> 16, 43, 55, 55, 26, 25, 23, 48, 42, 18, 22, 49, 31, 17, 57, …
## $ order_type                 <chr> "Standard", "Standard", "Standard", "Standard", "Standard", …
#checking out the number of NA values in estimated_distance_km
master_table %>%
  filter(is.na(estimated_distance_km)) %>%
  count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1  2874
#Adding median of estimated_distance_km order type wise value so that we can fill the NA values appropriately
master_table <- master_table %>%
  group_by(order_type) %>%
  mutate(
    estimated_distance_km = ifelse(is.na(estimated_distance_km),
                                   median(estimated_distance_km, na.rm = TRUE),
                                   estimated_distance_km)
  ) %>%
  ungroup()

#Converting character data type to dbl for calculation
master_table <- master_table %>%
  mutate(
    rule_distance_band = case_when(
      estimated_distance_km <= 2.0 ~ "0-2.0",
      estimated_distance_km <= 5.0 ~ "2.0-5.0",
      estimated_distance_km <= 8.0 ~ "5.0-8.0",
      TRUE ~ "8.0+" 
    ),
    rule_active_date = as.Date(ifelse(timestamp < as.Date("2023-06-01"), 
                                      "2023-01-01", 
                                      "2023-06-01"))
  )

#renaming the new made columns
master_table <- master_table %>%
  left_join(rules, by = c("rule_distance_band" = "distance_band_km", 
                          "rule_active_date" = "effective_date")) %>%
  select(-rule_distance_band, -rule_active_date)

#Checking number of NA values after filling
master_table %>% filter(is.na(estimated_distance_km)) %>% count()
## # A tibble: 1 × 1
##       n
##   <int>
## 1     0
glimpse(master_table)
## Rows: 25,000
## Columns: 28
## $ order_id                   <chr> "UEO00000001", "UEO00000002", "UEO00000003", "UEO00000004", …
## $ customer_id                <chr> "USR008907", "USR000848", "USR008870", "USR002340", "USR0047…
## $ restaurant_id              <chr> "UE00277", "UE00386", "UE00212", "UE00187", "UE00356", "UE00…
## $ delivery_fee_total         <dbl> 4.17, 1.14, 2.64, 3.74, 1.00, 2.45, 2.92, 2.17, 4.10, 1.29, …
## $ total_order_amount         <dbl> 29.34, 54.49, 35.58, 34.24, 27.44, 21.91, 18.80, 37.22, 34.1…
## $ timestamp                  <dttm> 2023-04-12 13:04:00, 2023-08-10 23:56:00, 2023-09-17 11:19:…
## $ device_type                <chr> "iOS", "iOS", "Android", "Android", "iOS", "iOS", "Web", "An…
## $ city                       <chr> "Edinburgh", "Edinburgh", "Leeds", "Leeds", "London", "Leeds…
## $ rainfall_mm                <dbl> 0.03, 0.18, 0.01, 0.42, 0.02, 0.04, 0.11, 0.10, 0.03, 0.02, …
## $ base_delivery_fee          <dbl> 2.49, 0.99, 1.99, 2.49, 0.99, 1.49, 1.99, 1.49, 2.49, 0.99, …
## $ surge_component            <dbl> 1.68, 0.15, 0.65, 1.25, 0.01, 0.96, 0.93, 0.68, 1.61, 0.30, …
## $ temperature_c              <dbl> 18.1, 15.9, 11.6, 16.2, 7.3, 11.0, 19.9, 16.4, 6.0, 22.0, 13…
## $ day_type                   <chr> "weekday", "weekday", "weekend", "weekend", "weekday", "week…
## $ cuisine_type               <chr> "Mexican", "Pizza", "Burgers", "Burgers", "Chinese", "Burger…
## $ price_tier                 <chr> "Mid-Range", "Mid-Range", "Mid-Range", "Budget", "Premium", …
## $ rating                     <dbl> 4.0, 2.9, 3.1, 4.9, 4.4, 4.8, 4.0, 4.8, 2.6, 3.3, 2.6, 5.0, …
## $ chain_flag                 <lgl> FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, FA…
## $ chain_name                 <chr> NA, "Chain_3", NA, NA, "Chain_23", NA, "Chain_11", NA, NA, N…
## $ postcode_prefix            <chr> "LM2", "FZ1", "EL6", "QJ7", "QI2", "KL6", "XL2", "RM1", "JF5…
## $ restaurant_name            <chr> "Local Restaurant UE00277", "Chain_3 36", "Local Restaurant …
## $ estimated_distance_km      <dbl> 5.97, 4.14, 7.47, 3.78, 0.39, 6.18, 2.85, 5.38, 2.63, 4.14, …
## $ estimated_delivery_minutes <dbl> 16, 43, 55, 55, 26, 25, 23, 48, 42, 18, 22, 49, 31, 17, 57, …
## $ order_type                 <chr> "Standard", "Standard", "Standard", "Standard", "Standard", …
## $ fee_tier                   <chr> "tier_3", "tier_2", "tier_3", "tier_2", "tier_1", "tier_3", …
## $ base_fee_gbp               <dbl> 1.99, 1.49, 1.99, 1.49, 0.99, 1.99, 1.49, 1.99, 1.49, 1.49, …
## $ surge_trigger              <chr> "driver_availability<5", "driver_availability<4", "driver_av…
## $ surge_per_unit_gbp         <dbl> 0.5, 0.6, 0.6, 0.5, 0.5, 0.5, 0.6, 0.5, 0.5, 0.6, 0.5, 0.6, …
## $ notes                      <chr> "Long distance", "Updated surge threshold Jun 2023", "Update…
#dropping any remaing .y files and adding order_hour
master_table <- master_table %>%
  select(-ends_with(".y")) %>%
  mutate(
    order_hour = hour(timestamp) 
  )

glimpse(master_table)
## Rows: 25,000
## Columns: 29
## $ order_id                   <chr> "UEO00000001", "UEO00000002", "UEO00000003", "UEO00000004", …
## $ customer_id                <chr> "USR008907", "USR000848", "USR008870", "USR002340", "USR0047…
## $ restaurant_id              <chr> "UE00277", "UE00386", "UE00212", "UE00187", "UE00356", "UE00…
## $ delivery_fee_total         <dbl> 4.17, 1.14, 2.64, 3.74, 1.00, 2.45, 2.92, 2.17, 4.10, 1.29, …
## $ total_order_amount         <dbl> 29.34, 54.49, 35.58, 34.24, 27.44, 21.91, 18.80, 37.22, 34.1…
## $ timestamp                  <dttm> 2023-04-12 13:04:00, 2023-08-10 23:56:00, 2023-09-17 11:19:…
## $ device_type                <chr> "iOS", "iOS", "Android", "Android", "iOS", "iOS", "Web", "An…
## $ city                       <chr> "Edinburgh", "Edinburgh", "Leeds", "Leeds", "London", "Leeds…
## $ rainfall_mm                <dbl> 0.03, 0.18, 0.01, 0.42, 0.02, 0.04, 0.11, 0.10, 0.03, 0.02, …
## $ base_delivery_fee          <dbl> 2.49, 0.99, 1.99, 2.49, 0.99, 1.49, 1.99, 1.49, 2.49, 0.99, …
## $ surge_component            <dbl> 1.68, 0.15, 0.65, 1.25, 0.01, 0.96, 0.93, 0.68, 1.61, 0.30, …
## $ temperature_c              <dbl> 18.1, 15.9, 11.6, 16.2, 7.3, 11.0, 19.9, 16.4, 6.0, 22.0, 13…
## $ day_type                   <chr> "weekday", "weekday", "weekend", "weekend", "weekday", "week…
## $ cuisine_type               <chr> "Mexican", "Pizza", "Burgers", "Burgers", "Chinese", "Burger…
## $ price_tier                 <chr> "Mid-Range", "Mid-Range", "Mid-Range", "Budget", "Premium", …
## $ rating                     <dbl> 4.0, 2.9, 3.1, 4.9, 4.4, 4.8, 4.0, 4.8, 2.6, 3.3, 2.6, 5.0, …
## $ chain_flag                 <lgl> FALSE, TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE, FA…
## $ chain_name                 <chr> NA, "Chain_3", NA, NA, "Chain_23", NA, "Chain_11", NA, NA, N…
## $ postcode_prefix            <chr> "LM2", "FZ1", "EL6", "QJ7", "QI2", "KL6", "XL2", "RM1", "JF5…
## $ restaurant_name            <chr> "Local Restaurant UE00277", "Chain_3 36", "Local Restaurant …
## $ estimated_distance_km      <dbl> 5.97, 4.14, 7.47, 3.78, 0.39, 6.18, 2.85, 5.38, 2.63, 4.14, …
## $ estimated_delivery_minutes <dbl> 16, 43, 55, 55, 26, 25, 23, 48, 42, 18, 22, 49, 31, 17, 57, …
## $ order_type                 <chr> "Standard", "Standard", "Standard", "Standard", "Standard", …
## $ fee_tier                   <chr> "tier_3", "tier_2", "tier_3", "tier_2", "tier_1", "tier_3", …
## $ base_fee_gbp               <dbl> 1.99, 1.49, 1.99, 1.49, 0.99, 1.99, 1.49, 1.99, 1.49, 1.49, …
## $ surge_trigger              <chr> "driver_availability<5", "driver_availability<4", "driver_av…
## $ surge_per_unit_gbp         <dbl> 0.5, 0.6, 0.6, 0.5, 0.5, 0.5, 0.6, 0.5, 0.5, 0.6, 0.5, 0.6, …
## $ notes                      <chr> "Long distance", "Updated surge threshold Jun 2023", "Update…
## $ order_hour                 <int> 13, 23, 11, 12, 14, 14, 11, 23, 10, 14, 21, 21, 20, 19, 16, …
#Taking Data Points for correlation matrix 
numeric_vars <- master_table %>%
  select(
    surge_component, 
    estimated_distance_km, 
    base_fee_gbp, 
    temperature_c, 
    rainfall_mm, 
    order_hour
  )

cor_matrix <- cor(numeric_vars)

#plotting correlation matrix for feature selection in our ML models
cor_data <- as.data.frame(cor_matrix) %>%
  mutate(Var1 = rownames(.)) %>%
  pivot_longer(cols = -Var1, names_to = "Var2", values_to = "value")

ggplot(cor_data, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "#2C7BB6", high = "#D7191C", mid = "white", 
                       midpoint = 0, limit = c(-1,1)) +
  geom_text(aes(label = round(value, 2)), color = "black", size = 4) +
  labs(title = "Feature Correlation Matrix",
       subtitle = "Justifying our predictive features for the ML model",
       x = "", y = "", fill = "Correlation") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

#Checking Surge component to see how many outliers are there and check if there is a need of ml model or not, we used box plot as easy to check outliers
ggplot(master_table, aes(x = order_type, y = surge_component, fill = order_type)) +
  geom_boxplot(alpha = 0.7, outlier.color = "#FF4B2B", outlier.alpha = 0.4) +
  scale_fill_manual(values = c("Near Me" = "#06C167", "Scheduled" = "#555555", "Standard" = "#000000")) +
  labs(title = "Surge Component Distribution by Order Type",
       subtitle = "Visualizing price volatility across service categories",
       x = "Service Type",
       y = "Surge Component (£)") +
  theme_minimal() +
  theme(legend.position = "none",
        plot.title = element_text(face = "bold", size = 14),
        panel.grid.minor = element_blank())

#Making the daypart column
master_table <- master_table %>%
  mutate(
    order_hour = hour(timestamp),

    daypart = case_when(
      order_hour >= 7 & order_hour < 11 ~ "Breakfast",
      order_hour >= 11 & order_hour < 15 ~ "Lunch",
      order_hour >= 17 & order_hour < 21 ~ "Dinner",
      TRUE ~ "Off-Peak"
    )
  )

#checking cusine impact
cuisine_impact <- master_table %>%
  group_by(cuisine_type) %>%
  summarise(
    avg_surge = mean(surge_component, na.rm = TRUE),
    max_surge = max(surge_component, na.rm = TRUE),
    total_orders = n()
  ) %>%
  arrange(desc(avg_surge))

print(cuisine_impact)
## # A tibble: 10 × 4
##    cuisine_type avg_surge max_surge total_orders
##    <chr>            <dbl>     <dbl>        <int>
##  1 Burgers           1.21      4.47         2700
##  2 Thai              1.21      4.49         2929
##  3 Desserts          1.19      4.5          2546
##  4 Chinese           1.19      4.47         2411
##  5 Indian            1.19      4.49         2228
##  6 Mexican           1.19      4.38         1857
##  7 Healthy           1.19      4.27         1848
##  8 Sushi             1.19      4.5          3046
##  9 Pizza             1.18      4.3          3064
## 10 Italian           1.18      4.49         2371
#made a dynamic price table that is rule based making 1.5 pound as the hard cap anything above it will not be safe
pricing_rule_table <- master_table %>%
  group_by(cuisine_type, daypart) %>%
  summarise(
    
    standard_surge = mean(surge_component[rainfall_mm == 0], na.rm = TRUE),
    
    rainy_weather_boost = mean(surge_component[rainfall_mm > 0], na.rm = TRUE) / 
      mean(surge_component[rainfall_mm == 0], na.rm = TRUE),
    
    is_safe = ifelse(standard_surge * rainy_weather_boost <= 1.5, "YES", "ADJUST LOWER"),
    .groups = "drop"
  ) %>%
  # 
  mutate(across(where(is.numeric), ~round(., 2)))

print("--- DYNAMIC PRICING RULE TABLE (CASE 18) ---")
## [1] "--- DYNAMIC PRICING RULE TABLE (CASE 18) ---"
print(pricing_rule_table)
## # A tibble: 40 × 5
##    cuisine_type daypart   standard_surge rainy_weather_boost is_safe
##    <chr>        <chr>              <dbl>               <dbl> <chr>  
##  1 Burgers      Breakfast           1.25                0.96 YES    
##  2 Burgers      Dinner              1.36                0.87 YES    
##  3 Burgers      Lunch               1.24                0.98 YES    
##  4 Burgers      Off-Peak            1.4                 0.87 YES    
##  5 Chinese      Breakfast           1.05                1.11 YES    
##  6 Chinese      Dinner              1.1                 1.09 YES    
##  7 Chinese      Lunch               0.97                1.26 YES    
##  8 Chinese      Off-Peak            1.06                1.1  YES    
##  9 Desserts     Breakfast           1.99                0.54 YES    
## 10 Desserts     Dinner              1.1                 1.08 YES    
## # ℹ 30 more rows
#Importing libraries for ml modelling
library(randomForest)
library(Metrics)
library(rpart)
library(gbm)

#Data for ML model
ml_data <- master_table %>%
  select(
    surge_component,
    estimated_distance_km, 
    order_hour, 
    rainfall_mm, 
    temperature_c, 
    cuisine_type, 
    order_type
  )

#Changing categorical data to binary so that we can do calculation, it's like one hot coding
ml_data <- ml_data %>%
  mutate(across(where(is.character), as.factor))

#Fixing seed so that result is reproducable
set.seed(42)

#Taking 80% of data as training data and 20% as testing data
train_index <- sample(1:nrow(ml_data), 0.8 * nrow(ml_data))
train_set <- ml_data[train_index, ]
test_set <- ml_data[-train_index, ]

#Trying Different models
cat("Training Linear Model...\n")
## Training Linear Model...
lm_model <- lm(surge_component ~ ., data = train_set)
lm_predictions <- predict(lm_model, test_set)
lm_mae <- mae(test_set$surge_component, lm_predictions)

cat("Training Random Forest Model...\n")
## Training Random Forest Model...
rf_model <- randomForest(surge_component ~ ., data = train_set, ntree = 100, importance = TRUE)
rf_predictions <- predict(rf_model, test_set)
rf_mae <- mae(test_set$surge_component, rf_predictions)

cat("Training Decision Tree...\n")
## Training Decision Tree...
tree_model <- rpart(surge_component ~ ., data = train_set)
tree_predictions <- predict(tree_model, test_set)
tree_mae <- mae(test_set$surge_component, tree_predictions)

cat("Training Gradient Boosting Machine (GBM)...\n")
## Training Gradient Boosting Machine (GBM)...
gbm_model <- gbm(surge_component ~ ., data = train_set, distribution = "gaussian", 
                 n.trees = 100, interaction.depth = 3, verbose = FALSE)
gbm_predictions <- predict(gbm_model, test_set, n.trees = 100)
gbm_mae <- mae(test_set$surge_component, gbm_predictions)

#Mean absolute error of all different types of model
final_results <- data.frame(
  Model_Type = c("Linear Regression", "Decision Tree", "Random Forest", "Gradient Boosting"),
  Algorithm_Family = c("Parametric", "Tree-Based", "Ensemble", "Ensemble"),
  Interpretability = c("High", "High", "Medium", "Low"),
  MAE_GBP = c(round(lm_mae, 3), round(tree_mae, 3), round(rf_mae, 3), round(gbm_mae, 3))
) %>% 
  arrange(MAE_GBP)


print("--- MODEL TOURNAMENT RESULTS ---")
## [1] "--- MODEL TOURNAMENT RESULTS ---"
print(final_results)
##          Model_Type Algorithm_Family Interpretability MAE_GBP
## 1     Decision Tree       Tree-Based             High   0.623
## 2 Gradient Boosting         Ensemble              Low   0.624
## 3 Linear Regression       Parametric             High   0.628
## 4     Random Forest         Ensemble           Medium   0.633
#Checking Importance Scores of different features  
tree_scores <- as.data.frame(tree_model$variable.importance)
colnames(tree_scores) <- "Importance_Score"
print("--- DECISION TREE: FEATURE IMPORTANCE SCORES ---")
## [1] "--- DECISION TREE: FEATURE IMPORTANCE SCORES ---"
print(tree_scores)
##               Importance_Score
## rainfall_mm         290.773632
## temperature_c         1.038477
#checking the logic of decision tree
library(rpart.plot)

rpart.plot(
  tree_model, 
  type = 3,           
  clip.right.labs = FALSE, 
  branch = .3,        
  under = TRUE,       
  box.palette = "Gn", 
  main = "Uber Eats Dynamic Pricing Decision Tree"
)