1 PENDAHULUAN

LBB Regression Model ini menggunakan data pengiriman makanan yang terjadi di India. Ini adalah data layanan kurir dimana restoran, toko, atau perusahaan pengiriman makanan mengantarkan makanan ke pelanggan. Pemesanan biasanya dilakukan melalui situs web atau aplikasi mobile restoran atau penjual bahan makanan, atau melalui perusahaan pemesanan makanan. Barang yang dikirim dapat mencakup makanan pembuka, pendamping, minuman, makanan penutup, atau barang belanjaan dan biasanya dikirim dalam kotak atau tas. Pengantar biasanya akan mengendarai mobil, tetapi di kota-kota besar di mana rumah dan restoran lebih dekat satu sama lain, mereka dapat menggunakan sepeda atau skuter bermotor.

Kita akan menggunakan Model Regresi untuk memprediksi berapa waktu yang dibutuhkan untuk mengirim makanan.

2 SETUP

# clear-up the environment
rm(list = ls())

# scientific notation
options(scipen = 999)

# chunk options
knitr::opts_chunk$set(
  fig.align = "center",
  message = FALSE,
  warning = FALSE,
  comment = "#>"
)

Library yang digunakan :

library(ggplot2)
library(dplyr)
library(geosphere)
library(tidyverse)
library(GGally)
library(caret)

3 IMPORT DATA

delivery_food_1 <- read.csv("data/train.csv")
delivery_food_2 <- read.csv("data/test.csv")
delivery_food <- bind_rows(delivery_food_1,delivery_food_2)

4 CEK NA

anyNA(delivery_food)
#> [1] TRUE
missing_values <- colSums(is.na(delivery_food))
print(missing_values)
#>                          ID          Delivery_person_ID 
#>                           0                           0 
#>         Delivery_person_Age     Delivery_person_Ratings 
#>                        2345                        2415 
#>         Restaurant_latitude        Restaurant_longitude 
#>                           0                           0 
#>  Delivery_location_latitude Delivery_location_longitude 
#>                           0                           0 
#>                  Order_Date                 Time_Orderd 
#>                           0                           0 
#>           Time_Order_picked           Weatherconditions 
#>                           0                           0 
#>        Road_traffic_density           Vehicle_condition 
#>                           0                           0 
#>               Type_of_order             Type_of_vehicle 
#>                           0                           0 
#>         multiple_deliveries                    Festival 
#>                        1231                           0 
#>                        City             Time_taken.min. 
#>                           0                       11399

5 HAPUS NA

delivery_food_clean <- na.omit(delivery_food)

6 DATA WRANGLING

delivery_food_clean <- delivery_food_clean %>%
  mutate(
    Time_Orderd = as.POSIXct(Time_Orderd, format = "%H:%M:%S"),
    Time_Order_picked = as.POSIXct(Time_Order_picked, format = "%H:%M:%S"),
    Order_preparation = as.numeric(difftime(Time_Order_picked, Time_Orderd, units = "mins")),
    Distance = distHaversine(cbind(Restaurant_longitude, Restaurant_latitude), cbind(Delivery_location_longitude, Delivery_location_latitude)),
    Delivery_time = as.numeric(gsub("\\(min\\) ",'',Time_taken.min.)),
    Weatherconditions = as.factor(Weatherconditions),
    Road_traffic_density = as.factor(Road_traffic_density),
    Vehicle_condition = as.factor(Vehicle_condition),
    Type_of_order = as.factor(Type_of_order),
    Type_of_vehicle = as.factor(Type_of_vehicle),
    multiple_deliveries = as.numeric(multiple_deliveries),
    Festival = as.factor(Festival),
    City = as.factor(City)
    )

delivery_food_clean <- delivery_food_clean %>%
  dplyr::select(
    Delivery_person_Age,
    Delivery_person_Ratings,
    Weatherconditions,
    Road_traffic_density,
    Vehicle_condition,
    Type_of_order,
    Type_of_vehicle,
    multiple_deliveries,
    Festival,
    City,
    Order_preparation,
    Distance,
    Delivery_time
  )

delivery_food_clean <- na.omit(delivery_food_clean)

7 OUTLIER

Terdapat outlier pada kolom Distance dan Order_preparation.

boxplot(delivery_food_clean$Distance, horizontal = T)

boxplot(delivery_food_clean$Order_preparation, horizontal = T)

8 SUBSET

Kita akan menghilangkan Order_preparation yang bernilai negatif dan Distance di atas 1.000.000

delivery_food_clean <- delivery_food_clean[delivery_food_clean$Order_preparation > 0 & delivery_food_clean$Distance <= 1000000,]

9 EXPLANATORY DATA ANALYSIS

summary(delivery_food_clean)
#>  Delivery_person_Age Delivery_person_Ratings             Weatherconditions
#>  Min.   :20.00       Min.   :2.500           conditions Cloudy    :6972   
#>  1st Qu.:25.00       1st Qu.:4.500           conditions Fog       :7121   
#>  Median :30.00       Median :4.700           conditions NaN       :   0   
#>  Mean   :29.57       Mean   :4.635           conditions Sandstorms:6898   
#>  3rd Qu.:35.00       3rd Qu.:4.900           conditions Stormy    :7044   
#>  Max.   :39.00       Max.   :5.000           conditions Sunny     :6738   
#>                                              conditions Windy     :6929   
#>  Road_traffic_density Vehicle_condition Type_of_order  
#>  High   : 4194        0:13910           Buffet :10314  
#>  Jam    :13418        1:13859           Drinks :10410  
#>  Low    :13716        2:13933           Meal   :10475  
#>  Medium :10374        3:    0           Snack  :10503  
#>  NaN    :    0                                         
#>                                                        
#>                                                        
#>           Type_of_vehicle  multiple_deliveries Festival    
#>  bicycle          :    0   Min.   :0.0000      NaN :  201  
#>  electric_scooter : 3368   1st Qu.:0.0000      No  :40657  
#>  motorcycle       :24391   Median :1.0000      Yes :  844  
#>  scooter          :13943   Mean   :0.7456                  
#>                            3rd Qu.:1.0000                  
#>                            Max.   :3.0000                  
#>                                                            
#>              City       Order_preparation    Distance     Delivery_time  
#>  Metropolitian :31241   Min.   : 5.00     Min.   : 1467   Min.   :10.00  
#>  NaN           : 1063   1st Qu.: 5.00     1st Qu.: 4663   1st Qu.:19.00  
#>  Semi-Urban    :  152   Median :10.00     Median : 9229   Median :26.00  
#>  Urban         : 9246   Mean   : 9.96     Mean   : 9718   Mean   :26.45  
#>                         3rd Qu.:15.00     3rd Qu.:13697   3rd Qu.:33.00  
#>                         Max.   :15.00     Max.   :20993   Max.   :54.00  
#> 
str(delivery_food_clean)
#> 'data.frame':    41702 obs. of  13 variables:
#>  $ Delivery_person_Age    : num  37 34 23 38 32 22 33 35 22 36 ...
#>  $ Delivery_person_Ratings: num  4.9 4.5 4.4 4.7 4.6 4.8 4.7 4.6 4.8 4.2 ...
#>  $ Weatherconditions      : Factor w/ 7 levels "conditions Cloudy",..: 6 5 4 6 1 1 2 1 5 2 ...
#>  $ Road_traffic_density   : Factor w/ 5 levels "High ","Jam ",..: 1 2 3 4 1 2 2 4 2 2 ...
#>  $ Vehicle_condition      : Factor w/ 4 levels "0","1","2","3": 3 3 1 1 2 1 2 3 1 3 ...
#>  $ Type_of_order          : Factor w/ 4 levels "Buffet ","Drinks ",..: 4 4 2 1 4 1 3 3 1 4 ...
#>  $ Type_of_vehicle        : Factor w/ 4 levels "bicycle ","electric_scooter ",..: 3 4 3 3 4 3 4 3 3 3 ...
#>  $ multiple_deliveries    : num  0 1 1 1 1 1 1 1 1 3 ...
#>  $ Festival               : Factor w/ 3 levels "NaN ","No ","Yes ": 2 2 2 2 2 2 2 2 2 2 ...
#>  $ City                   : Factor w/ 4 levels "Metropolitian ",..: 4 1 4 1 1 4 1 1 1 1 ...
#>  $ Order_preparation      : num  15 5 15 10 15 10 15 5 10 15 ...
#>  $ Distance               : num  3029 20206 1554 7799 6217 ...
#>  $ Delivery_time          : num  24 33 26 21 30 26 40 32 34 46 ...
#>  - attr(*, "na.action")= 'omit' Named int [1:91] 2233 2713 2756 3350 4397 4833 5012 5284 5522 5985 ...
#>   ..- attr(*, "names")= chr [1:91] "2388" "2905" "2950" "3585" ...

Berikut adalah keterangan untuk setiap kolom : - Delivery_person_Age: Usia Kurir

- Delivery_person_Ratings: Rating Kurir

- Weatherconditions: Keadaan Cuaca

- Road_traffic_density: Keadaan Lalu Lintas

- Vehicle_condition: Keadaan Kendaraan

- Type_of_order: Jenis Pemesanan

- Type_of_vehicle: Jenis Kendaraaan

- multiple_deliveries: Pengiriman Multiple

- Festival : Apakah ada Festival

- City: Jenis Kota

- Order_preparation: Durasi Persiapan Paket

- Distance: Jarak Antara Restoran ke Tujuan

- Delivery_time: Waktu Pengiriman dari Restoran ke Tujuan

Berikut adalah korelasi antar variable:

ggcorr(delivery_food_clean, label = TRUE, label_size = 2.9, hjust = 1, layout.exp = 2)

Terlihat bahwa, variable yang paling mempengaruhi Delivery_time adalah multiple_deliveries, Distance, dan Delivert_person_Age

10 MODEL SINGLE REGRESI LINEAR

Kita akan membuat regresi linear dengan variable prediktor multiple_deliveries karena variable tersebut memiliki korelasi positif tertinggi dengan variable Delivery_time.

model_regresi_single <- lm(Delivery_time ~ multiple_deliveries, delivery_food_clean)
summary(model_regresi_single)
#> 
#> Call:
#> lm(formula = Delivery_time ~ multiple_deliveries, data = delivery_food_clean)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -18.070  -6.708  -0.708   5.292  32.292 
#> 
#> Coefficients:
#>                     Estimate Std. Error t value            Pr(>|t|)    
#> (Intercept)         21.70754    0.06946  312.51 <0.0000000000000002 ***
#> multiple_deliveries  6.36199    0.07383   86.17 <0.0000000000000002 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 8.651 on 41700 degrees of freedom
#> Multiple R-squared:  0.1511, Adjusted R-squared:  0.1511 
#> F-statistic:  7425 on 1 and 41700 DF,  p-value: < 0.00000000000000022
plot(delivery_food_clean$multiple_deliveries, delivery_food_clean$Delivery_time)
abline(model_regresi_single$coefficients[1],model_regresi_single$coefficients[2])

Dapat dilihat bahwa, adjusted R-squared hanya memiliki nilai 0.1511. Berarti variable multiple_deliveries tidak cukup untuk menjelaskan. Harus menambahkan variable lain untuk meningkatkan kualitas model.

11 STEPWISE FOR FEATURE SELECTION

Kita dapat menggunakan stepwise regression dengan metode backward untuk menentukan variable apa saja yang dapat meningkatkan kualitas model.

model_backward <- lm(Delivery_time ~ ., delivery_food_clean)
model_backward <- step(model_backward, direction = "backward")
#> Start:  AIC=146981
#> Delivery_time ~ Delivery_person_Age + Delivery_person_Ratings + 
#>     Weatherconditions + Road_traffic_density + Vehicle_condition + 
#>     Type_of_order + Type_of_vehicle + multiple_deliveries + Festival + 
#>     City + Order_preparation + Distance
#> 
#>                           Df Sum of Sq     RSS    AIC
#> - Type_of_order            3        58 1413604 146977
#> - Order_preparation        1         9 1413554 146979
#> - Type_of_vehicle          2        80 1413626 146979
#> <none>                                 1413546 146981
#> - City                     3     42401 1455947 148207
#> - Festival                 2     73881 1487427 149102
#> - multiple_deliveries      1    100301 1513847 149838
#> - Distance                 1    108181 1521727 150054
#> - Vehicle_condition        2    124630 1538176 150501
#> - Weatherconditions        5    180593 1594139 151985
#> - Delivery_person_Age      1    196896 1610442 152417
#> - Delivery_person_Ratings  1    203377 1616923 152585
#> - Road_traffic_density     3    335809 1749355 155864
#> 
#> Step:  AIC=146976.7
#> Delivery_time ~ Delivery_person_Age + Delivery_person_Ratings + 
#>     Weatherconditions + Road_traffic_density + Vehicle_condition + 
#>     Type_of_vehicle + multiple_deliveries + Festival + City + 
#>     Order_preparation + Distance
#> 
#>                           Df Sum of Sq     RSS    AIC
#> - Order_preparation        1         8 1413612 146975
#> - Type_of_vehicle          2        80 1413683 146975
#> <none>                                 1413604 146977
#> - City                     3     42420 1456024 148204
#> - Festival                 2     73907 1487510 149098
#> - multiple_deliveries      1    100264 1513868 149832
#> - Distance                 1    108203 1521807 150050
#> - Vehicle_condition        2    124658 1538262 150497
#> - Weatherconditions        5    180627 1594231 151981
#> - Delivery_person_Age      1    196913 1610517 152413
#> - Delivery_person_Ratings  1    203344 1616947 152579
#> - Road_traffic_density     3    335835 1749439 155860
#> 
#> Step:  AIC=146974.9
#> Delivery_time ~ Delivery_person_Age + Delivery_person_Ratings + 
#>     Weatherconditions + Road_traffic_density + Vehicle_condition + 
#>     Type_of_vehicle + multiple_deliveries + Festival + City + 
#>     Distance
#> 
#>                           Df Sum of Sq     RSS    AIC
#> - Type_of_vehicle          2        80 1413692 146973
#> <none>                                 1413612 146975
#> - City                     3     42422 1456034 148202
#> - Festival                 2     73912 1487524 149096
#> - multiple_deliveries      1    100267 1513879 149831
#> - Distance                 1    108216 1521828 150049
#> - Vehicle_condition        2    124672 1538284 150496
#> - Weatherconditions        5    180622 1594234 151979
#> - Delivery_person_Age      1    196937 1610549 152412
#> - Delivery_person_Ratings  1    203353 1616965 152578
#> - Road_traffic_density     3    335829 1749441 155858
#> 
#> Step:  AIC=146973.3
#> Delivery_time ~ Delivery_person_Age + Delivery_person_Ratings + 
#>     Weatherconditions + Road_traffic_density + Vehicle_condition + 
#>     multiple_deliveries + Festival + City + Distance
#> 
#>                           Df Sum of Sq     RSS    AIC
#> <none>                                 1413692 146973
#> - City                     3     42401 1456093 148200
#> - Festival                 2     73941 1487633 149095
#> - multiple_deliveries      1    100242 1513934 149828
#> - Distance                 1    108276 1521967 150049
#> - Weatherconditions        5    180619 1594310 151977
#> - Vehicle_condition        2    188818 1602510 152197
#> - Delivery_person_Age      1    196902 1610593 152409
#> - Delivery_person_Ratings  1    203434 1617126 152578
#> - Road_traffic_density     3    335782 1749474 155854
model_regresi_multiple <- lm(formula = Delivery_time ~ Delivery_person_Age + Delivery_person_Ratings + 
    Weatherconditions + Road_traffic_density + Vehicle_condition + 
    multiple_deliveries + Festival + City + log(Distance), data = delivery_food_clean)

summary(model_regresi_multiple)
#> 
#> Call:
#> lm(formula = Delivery_time ~ Delivery_person_Age + Delivery_person_Ratings + 
#>     Weatherconditions + Road_traffic_density + Vehicle_condition + 
#>     multiple_deliveries + Festival + City + log(Distance), data = delivery_food_clean)
#> 
#> Residuals:
#>     Min      1Q  Median      3Q     Max 
#> -19.434  -4.098  -0.171   3.825  28.375 
#> 
#> Coefficients:
#>                                         Estimate Std. Error t value
#> (Intercept)                            28.917671   0.761067  37.996
#> Delivery_person_Age                     0.381217   0.005078  75.071
#> Delivery_person_Ratings                -7.345587   0.094375 -77.834
#> Weatherconditionsconditions Fog         0.034173   0.099031   0.345
#> Weatherconditionsconditions Sandstorms -2.760842   0.100109 -27.578
#> Weatherconditionsconditions Stormy     -2.776258   0.099521 -27.896
#> Weatherconditionsconditions Sunny      -6.227119   0.100951 -61.685
#> Weatherconditionsconditions Windy      -2.660398   0.099927 -26.623
#> Road_traffic_densityJam                 0.602088   0.114402   5.263
#> Road_traffic_densityLow                -6.131794   0.105612 -58.060
#> Road_traffic_densityMedium             -2.391960   0.116122 -20.599
#> Vehicle_condition1                     -4.576217   0.071439 -64.058
#> Vehicle_condition2                     -4.554126   0.071289 -63.882
#> multiple_deliveries                     2.897467   0.052612  55.072
#> FestivalNo                              6.452198   0.417555  15.452
#> FestivalYes                            15.843985   0.468761  33.800
#> CityNaN                                -2.524589   0.183733 -13.741
#> CitySemi-Urban                          9.745056   0.482464  20.199
#> CityUrban                              -1.853113   0.070376 -26.332
#> log(Distance)                           2.211046   0.045274  48.837
#>                                                    Pr(>|t|)    
#> (Intercept)                            < 0.0000000000000002 ***
#> Delivery_person_Age                    < 0.0000000000000002 ***
#> Delivery_person_Ratings                < 0.0000000000000002 ***
#> Weatherconditionsconditions Fog                        0.73    
#> Weatherconditionsconditions Sandstorms < 0.0000000000000002 ***
#> Weatherconditionsconditions Stormy     < 0.0000000000000002 ***
#> Weatherconditionsconditions Sunny      < 0.0000000000000002 ***
#> Weatherconditionsconditions Windy      < 0.0000000000000002 ***
#> Road_traffic_densityJam                         0.000000142 ***
#> Road_traffic_densityLow                < 0.0000000000000002 ***
#> Road_traffic_densityMedium             < 0.0000000000000002 ***
#> Vehicle_condition1                     < 0.0000000000000002 ***
#> Vehicle_condition2                     < 0.0000000000000002 ***
#> multiple_deliveries                    < 0.0000000000000002 ***
#> FestivalNo                             < 0.0000000000000002 ***
#> FestivalYes                            < 0.0000000000000002 ***
#> CityNaN                                < 0.0000000000000002 ***
#> CitySemi-Urban                         < 0.0000000000000002 ***
#> CityUrban                              < 0.0000000000000002 ***
#> log(Distance)                          < 0.0000000000000002 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> Residual standard error: 5.877 on 41682 degrees of freedom
#> Multiple R-squared:  0.6084, Adjusted R-squared:  0.6083 
#> F-statistic:  3409 on 19 and 41682 DF,  p-value: < 0.00000000000000022

Metode step-wise regression ini akan menghasilkan formula optimum berdasarkan nilai AIC yang terendah, dimana semakin rendah nilai AIC tersebut, maka nilai observasi yang tidak tertangkap semakin kecil.

Bila dibandingkan dengan model awal yang hanya menggunakan variabel multiple_deliveries, model regresi yang menggunakan variabel prediktor Delivery_person_Age, Delivery_person_Ratings, Weatherconditionsconditions, Road_traffic_density, Vehicle_condition, multiple_deliveries, Festival, City, dan Distance memiliki R-squared 0.6083. Jauh lebih tinggi dibandingkan model sebelumnya yaitu 0.1511.

12 PREDICTION INTERVAL

range(delivery_food_clean$Delivery_time)
#> [1] 10 54
RMSE(pred = model_regresi_multiple$fitted.values, obs = delivery_food_clean$Delivery_time)
#> [1] 5.875458

Kesimpulan: RMSE cukup tinggi. Sekitar 11% dari rentang variable target. Artinya Model yang dihasilkan oleh stepwise selection pun belum cukup baik.

13 ASUMSI LINEAR REGRESSION

13.1 Linearity

plot(model_backward, which = 1)
abline(h = 10, col = "green")
abline(h = -10, col = "green")

Kesimpulan: karena garis merah masih ada di dalam cakupan toleransi kita, sehingga model_backward adalabh model yang linear.

13.2 Normality of Residuals

hist(model_backward$residuals)

ks.test(model_backward$residuals, "pnorm")
#> 
#>  Asymptotic one-sample Kolmogorov-Smirnov test
#> 
#> data:  model_backward$residuals
#> D = 0.35861, p-value < 0.00000000000000022
#> alternative hypothesis: two-sided

Kesimpulan: p-value yang lebih kecil dari 0.05 menunjukkan bahwa data tidak terdistribusi normal.

13.3 Homoscedasticity of Residuals

plot(x = model_backward$fitted.values, y = model_backward$residuals)
abline(h = 0, col = "red")

library(lmtest)
bptest(model_backward)
#> 
#>  studentized Breusch-Pagan test
#> 
#> data:  model_backward
#> BP = 1194.2, df = 19, p-value < 0.00000000000000022

Kesimpulan: karena nilai p-value dari BPtest < 0.05, sehingga model ini tidak memenuhi asumsi homoscedasticity of residual

13.4 No Multicollinearity

library(car)
vif(model_backward)
#>                             GVIF Df GVIF^(1/(2*Df))
#> Delivery_person_Age     1.035848  1        1.017766
#> Delivery_person_Ratings 1.053100  1        1.026206
#> Weatherconditions       1.025008  5        1.002473
#> Road_traffic_density    1.294919  3        1.044016
#> Vehicle_condition       1.033807  2        1.008347
#> multiple_deliveries     1.102213  1        1.049863
#> Festival                1.068981  2        1.016816
#> City                    1.047122  3        1.007704
#> Distance                1.268914  1        1.126461

Kesimpulan: Dari uji VIF, prediktor di model_backward lolos uji asumsi multicolinearity (tidak ada nilai VIF > 10)

14 KESIMPULAN DAN SARAN

Berdasarkan model yang telah dieksplorasi, bahwa prediksi durasi pengiriman atas data delivery makanan ini tidak dapat diselesaikan dengan metode regresi linear. Mungkin perlu tambahan prediktor berupa kategori cluster setiap lokasi restoran dan lokasi tujuan pengiriman. Jadi seluruh kota ditentukan cluster, dimana lokasi restoran dan lokasi tujuan akan di-mapping ke cluster tersebut.