The goal is to simulate an ecommerce environment where orders come in to an order management system and need to be assigned a shipping service before the order data drops into the distribution center’s warehouse management system.

We’ll create that process of assigning the shipping service, but doing so in a way that optimizes our business constraints (i.e. costs, customer service, etc.). We’ll be using linear programming and the lpSolveAPI library for the optimization.



Load packages
library(tidyverse)
library(data.table)
library(lpSolveAPI)
Load the data
orderData <- fread("./data/order_data.csv")
serviceData <- fread("./data/shipping_service_data.csv")

# complete some initial restructuring of columns
orderData$order_id <- as.factor(orderData$order_id)
serviceData$delivery_service <- as.factor(serviceData$delivery_service)
serviceData$cost_per_package <- as.integer(serviceData$cost_per_package)

Order Data

head(orderData)
##    order_id order_value distance_to_destination_mi date_ordered
## 1:        1          25                         20     3/1/2020
## 2:        2          35                         25     3/6/2020
## 3:        3          45                         10    2/27/2020
## 4:        4          25                         30    2/25/2020
## 5:        5          10                         50    3/15/2020
## 6:        6          25                          5     3/5/2020
##    promised_delivery_date
## 1:              3/11/2020
## 2:              3/15/2020
## 3:               3/1/2020
## 4:              2/28/2020
## 5:              3/16/2020
## 6:               3/7/2020
str(orderData)
## Classes 'data.table' and 'data.frame':   14 obs. of  5 variables:
##  $ order_id                  : Factor w/ 14 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ order_value               : int  25 35 45 25 10 25 35 45 25 10 ...
##  $ distance_to_destination_mi: int  20 25 10 30 50 5 8 60 90 15 ...
##  $ date_ordered              : chr  "3/1/2020" "3/6/2020" "2/27/2020" "2/25/2020" ...
##  $ promised_delivery_date    : chr  "3/11/2020" "3/15/2020" "3/1/2020" "2/28/2020" ...
##  - attr(*, ".internal.selfref")=<externalptr>

Service Data

head(serviceData)
##    delivery_service service_type days_in_transit miles_covered_per_day
## 1:            fedex     standard               7                     5
## 2:            fedex      express               2                    40
## 3:            fedex          air               1                   100
## 4:              ups     standard               6                     6
## 5:              ups      express               2                    35
## 6:             usps     standard              10                     5
##    cost_per_package
## 1:               10
## 2:               15
## 3:               30
## 4:               12
## 5:               14
## 6:                6
str(serviceData)
## Classes 'data.table' and 'data.frame':   7 obs. of  5 variables:
##  $ delivery_service     : Factor w/ 3 levels "fedex","ups",..: 1 1 1 2 2 3 3
##  $ service_type         : chr  "standard" "express" "air" "standard" ...
##  $ days_in_transit      : int  7 2 1 6 2 10 4
##  $ miles_covered_per_day: int  5 40 100 6 35 5 20
##  $ cost_per_package     : int  10 15 30 12 14 6 10
##  - attr(*, ".internal.selfref")=<externalptr>

Add Features

# order data
orderData <- orderData[,
                       `:=`(date_ordered = as.Date(date_ordered, "%m/%d/%Y"),
                            promised_delivery_date = 
                                as.Date(promised_delivery_date, "%m/%d/%Y"))]

orderData <- orderData[, days_to_deliver := difftime(promised_delivery_date, date_ordered,
                                                     units = 'days')]

orderData$days_to_deliver <- as.numeric(orderData$days_to_deliver)

orderData$package_cnt <- 1

# service data
serviceData <- serviceData[, total_miles := days_in_transit * miles_covered_per_day]

Service Data

head(serviceData)
##    delivery_service service_type days_in_transit miles_covered_per_day
## 1:            fedex     standard               7                     5
## 2:            fedex      express               2                    40
## 3:            fedex          air               1                   100
## 4:              ups     standard               6                     6
## 5:              ups      express               2                    35
## 6:             usps     standard              10                     5
##    cost_per_package total_miles
## 1:               10          35
## 2:               15          80
## 3:               30         100
## 4:               12          36
## 5:               14          70
## 6:                6          50
str(serviceData)
## Classes 'data.table' and 'data.frame':   7 obs. of  6 variables:
##  $ delivery_service     : Factor w/ 3 levels "fedex","ups",..: 1 1 1 2 2 3 3
##  $ service_type         : chr  "standard" "express" "air" "standard" ...
##  $ days_in_transit      : int  7 2 1 6 2 10 4
##  $ miles_covered_per_day: int  5 40 100 6 35 5 20
##  $ cost_per_package     : int  10 15 30 12 14 6 10
##  $ total_miles          : int  35 80 100 36 70 50 80
##  - attr(*, ".internal.selfref")=<externalptr>

Order Data

head(orderData)
##    order_id order_value distance_to_destination_mi date_ordered
## 1:        1          25                         20   2020-03-01
## 2:        2          35                         25   2020-03-06
## 3:        3          45                         10   2020-02-27
## 4:        4          25                         30   2020-02-25
## 5:        5          10                         50   2020-03-15
## 6:        6          25                          5   2020-03-05
##    promised_delivery_date days_to_deliver package_cnt
## 1:             2020-03-11              10           1
## 2:             2020-03-15               9           1
## 3:             2020-03-01               3           1
## 4:             2020-02-28               3           1
## 5:             2020-03-16               1           1
## 6:             2020-03-07               2           1
str(orderData)
## Classes 'data.table' and 'data.frame':   14 obs. of  7 variables:
##  $ order_id                  : Factor w/ 14 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ order_value               : int  25 35 45 25 10 25 35 45 25 10 ...
##  $ distance_to_destination_mi: int  20 25 10 30 50 5 8 60 90 15 ...
##  $ date_ordered              : Date, format: "2020-03-01" "2020-03-06" ...
##  $ promised_delivery_date    : Date, format: "2020-03-11" "2020-03-15" ...
##  $ days_to_deliver           : num  10 9 3 3 1 2 3 5 4 9 ...
##  $ package_cnt               : num  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>

Optimize Shipping Service by Carrier

shipping.opt.carrier <- function(order_data, service_data, carrier){
    
    # set up a matrix to output results into
    colCnt <- nrow(unique(service_data[service_data$delivery_service == carrier,
                                      "service_type"]))
    
    result_matrix <- matrix(0, nrow = nrow(order_data),
                            ncol = colCnt)
    
    row.names(result_matrix) <- order_data$order_id
    
    servTypes <- unique(service_data[service_data$delivery_service == carrier, 
                            "service_type"])
    
    colnames(result_matrix) <- c(servTypes$service_type)
    Cost <- matrix(0, nrow = nrow(order_data), ncol = 1)
    colnames(Cost) <- "Cost"
    result_matrix <- cbind(result_matrix, Cost)
    
    
        
    # for loop to solve optimization problem
        for(i in 1:nrow(order_data)){
            
            # start with 0 constraints and 3 decision variable
            lprec <- make.lp(nrow = 0, 
                             ncol = nrow(service_data[service_data$delivery_service == carrier,
                                                      "cost_per_package"]))
            
            # select minimize
            lp.control(lprec, sense = "min")
            
            # set type to integer
            set.type(lprec, 
                     columns = 1:nrow(service_data[service_data$delivery_service == carrier, 
                                                   "cost_per_package"]),
                     type = "integer")
            
            # set objective function coefficients
            coef <- service_data[service_data$delivery_service == carrier,
                                 "cost_per_package"]
            set.objfn(lprec, 
                      obj = coef$cost_per_package)
            
            # days_in_transit must be less than or equal to days_to_deliver
            daysToDel <- service_data[service_data$delivery_service == carrier,
                                               "days_in_transit"]
            add.constraint(lprec, 
                           xt = daysToDel$days_in_transit,
                           type = "<=", rhs = c(order_data[i, "days_to_deliver"]))
            
            # total_miles must be greater than or equal to distance_to_destination_mi
            totalMiles <- service_data[service_data$delivery_service == carrier,
                                               "total_miles"]
            add.constraint(lprec, 
                           xt = totalMiles$total_miles,
                           type = ">=", rhs = c(order_data[i, "distance_to_destination_mi"]))
            
            # must select one service
            selectOne <- rep(1, length(1:length(daysToDel$days_in_transit)))
            
            add.constraint(lprec, xt = selectOne,
                          type = "=", rhs = 1)
            
            # solve the lp
            solve(lprec)
            
            # store results in result_matrix
            matColcnt <- ncol(result_matrix) - 1
            result_matrix[i, 1:matColcnt] <- get.variables(lprec)
            result_matrix[i, "Cost"] <- get.objective(lprec)
            
        }
    
    # convert matrix to dataframe. Adding order_id and delivery_service column
    df <- data.frame(order_id = row.names(result_matrix), 
                     delivery_service = carrier,
                     result_matrix)
    
    # if lpsolve can't find a solution that meets all constraints it inputs
    # a large scientific number. Not sure the cause of this, but below code
    # is a temporary fix to turn these large numbers into NAs. 
    # Ignore warning message about NAs b/c thats what we want
    # round() needs to be present cauase as.integer will round down
    df$Cost <- suppressWarnings(as.integer(round(df$Cost)))
    
    # Using data.table for melt function to create tidy dataset
    df$Cost <- as.character(df$Cost) # convert to character for melt
    setDT(df) # convert from dataframe to datatable
    
    # suppress messages on melt (no issues)
    df <- suppressWarnings(melt(df,
                                variable.name = "service_type",
                                value.name = "ind"))
    df <- df[ind == 1, ] # filter
    df$Cost <- as.numeric(df$Cost)
    df <- df[, c("order_id", "delivery_service", "service_type", "Cost")]
    
    
    # returns a data.table/data.frame
    return(df)
    
}
        
fedExCost <- shipping.opt.carrier(order_data = orderData,
                          service_data = serviceData,
                          carrier = "fedex")

upsCost <- shipping.opt.carrier(order_data = orderData,
                          service_data = serviceData,
                          carrier = "ups")

uspsCost <- shipping.opt.carrier(order_data = orderData,
                          service_data = serviceData,
                          carrier = "usps")

Data by Carrier

head(fedExCost)
##    order_id delivery_service service_type Cost
## 1:        1            fedex     standard   10
## 2:        2            fedex     standard   10
## 3:       10            fedex     standard   10
## 4:        3            fedex      express   15
## 5:        4            fedex      express   15
## 6:        6            fedex      express   15
str(fedExCost)
## Classes 'data.table' and 'data.frame':   14 obs. of  4 variables:
##  $ order_id        : Factor w/ 14 levels "1","10","11",..: 1 7 2 8 9 11 12 13 3 4 ...
##  $ delivery_service: Factor w/ 1 level "fedex": 1 1 1 1 1 1 1 1 1 1 ...
##  $ service_type    : Factor w/ 3 levels "standard","express",..: 1 1 1 2 2 2 2 2 2 2 ...
##  $ Cost            : num  10 10 10 15 15 15 15 15 15 15 ...
##  - attr(*, ".internal.selfref")=<externalptr>
head(upsCost)
##    order_id delivery_service service_type Cost
## 1:        1              ups     standard   12
## 2:        2              ups     standard   12
## 3:       10              ups     standard   12
## 4:        3              ups      express   14
## 5:        4              ups      express   14
## 6:        6              ups      express   14
str(upsCost)
## Classes 'data.table' and 'data.frame':   12 obs. of  4 variables:
##  $ order_id        : Factor w/ 14 levels "1","10","11",..: 1 7 2 8 9 11 12 13 3 4 ...
##  $ delivery_service: Factor w/ 1 level "ups": 1 1 1 1 1 1 1 1 1 1 ...
##  $ service_type    : Factor w/ 2 levels "standard","express": 1 1 1 2 2 2 2 2 2 2 ...
##  $ Cost            : num  12 12 12 14 14 14 14 14 14 14 ...
##  - attr(*, ".internal.selfref")=<externalptr>
head(uspsCost)
##    order_id delivery_service service_type Cost
## 1:        1             usps     standard    6
## 2:        2             usps      express   10
## 3:        8             usps      express   10
## 4:       10             usps      express   10
## 5:       12             usps      express   10
## 6:       13             usps      express   10
str(uspsCost)
## Classes 'data.table' and 'data.frame':   7 obs. of  4 variables:
##  $ order_id        : Factor w/ 14 levels "1","10","11",..: 1 7 13 2 4 5 6
##  $ delivery_service: Factor w/ 1 level "usps": 1 1 1 1 1 1 1
##  $ service_type    : Factor w/ 2 levels "standard","express": 1 2 2 2 2 2 2
##  $ Cost            : num  6 10 10 10 10 10 10
##  - attr(*, ".internal.selfref")=<externalptr>

Choose Best Shipping Service Per Order

select.Service <- function(carrier_one, carrier_two, carrier_three){
    
    # stack all carrier tables together
    df <- suppressWarnings(bind_rows(carrier_one, 
                                     carrier_two, 
                                     carrier_three))
    
    # Adding a priority level for each carrier. This is simple logic to
    # solve tie breakers. In real life this might be based on commitments we've
    # made to carriers etc.
    df <- df %>% 
        mutate(carrier_rank = ifelse(delivery_service == 'fedex', 1,
                                     ifelse(delivery_service == 'usps', 2,
                                            ifelse(delivery_service == 'ups',
                                                   3, NA))))
    
    # Apply rank and filtering based on rank by order_id
    df <- df %>% 
        arrange(order_id, Cost, carrier_rank) %>% 
        group_by(order_id) %>% 
        mutate(rank = 1:n()) %>% 
        filter(rank == 1) %>% 
        select("order_id", "delivery_service",
               "service_type", "Cost") %>% 
        rename(shipping_cost = Cost)
    
    
    return(df)
}

servSelect <- select.Service(carrier_one = fedExCost,
                             carrier_two = upsCost,
                             carrier_three = uspsCost)

Add Shipping Info to Order Info

orderData <- merge(orderData, servSelect, 
                   by = 'order_id', all.x = TRUE)

head(orderData)
##    order_id order_value distance_to_destination_mi date_ordered
## 1:        1          25                         20   2020-03-01
## 2:        2          35                         25   2020-03-06
## 3:        3          45                         10   2020-02-27
## 4:        4          25                         30   2020-02-25
## 5:        5          10                         50   2020-03-15
## 6:        6          25                          5   2020-03-05
##    promised_delivery_date days_to_deliver package_cnt delivery_service
## 1:             2020-03-11              10           1             usps
## 2:             2020-03-15               9           1            fedex
## 3:             2020-03-01               3           1              ups
## 4:             2020-02-28               3           1              ups
## 5:             2020-03-16               1           1            fedex
## 6:             2020-03-07               2           1              ups
##    service_type shipping_cost
## 1:     standard             6
## 2:     standard            10
## 3:      express            14
## 4:      express            14
## 5:          air            30
## 6:      express            14
str(orderData)
## Classes 'data.table' and 'data.frame':   14 obs. of  10 variables:
##  $ order_id                  : Factor w/ 14 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ order_value               : int  25 35 45 25 10 25 35 45 25 10 ...
##  $ distance_to_destination_mi: int  20 25 10 30 50 5 8 60 90 15 ...
##  $ date_ordered              : Date, format: "2020-03-01" "2020-03-06" ...
##  $ promised_delivery_date    : Date, format: "2020-03-11" "2020-03-15" ...
##  $ days_to_deliver           : num  10 9 3 3 1 2 3 5 4 9 ...
##  $ package_cnt               : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ delivery_service          : chr  "usps" "fedex" "ups" "ups" ...
##  $ service_type              : chr  "standard" "standard" "express" "express" ...
##  $ shipping_cost             : num  6 10 14 14 30 14 14 10 30 10 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "order_id"