library(readxl)
library(tidyverse)
library(caret)
library(ggplot2)
library(dplyr)
library(car)
library(DMwR2)
library(ROSE)
library(corrplot)
library(reshape2)
library(vcd)
library(pROC)
library(knitr)
library(randomForest)
library(xgboost)
library(GGally)
library(ggpubr)
library(nortest)
library(stringr)
library(writexl)
library(moments)
library(Metrics)
file_path <- "C:/Users/yuan1/Downloads/transfer_pricing_daily_data_v2.xlsx"
price<- readxl::read_excel(file_path)
# make the names readable for R
colnames(price) <- make.names(colnames(price))
print(colnames(price))
## [1] "Company.Name"
## [2] "Business.Unit"
## [3] "Report.Period"
## [4] "Currency"
## [5] "Region"
## [6] "Transfer.Pricing.Method"
## [7] "Benchmarking"
## [8] "Transaction.Volume..Gallons."
## [9] "Unit.Price....Gallon."
## [10] "Market.Reference.Price....Gallon."
## [11] "Trade.Terms"
## [12] "Crude.Oil.Purchase.Price....Barrel."
## [13] "Refining.Cost....Gallon."
## [14] "Transport.Cost....Gallon."
## [15] "Storage.Cost....Gallon."
## [16] "Pipeline.Tariffs....Gallon."
## [17] "Terminal.Fees....Gallon."
## [18] "Operating.Expenses....Gallon."
## [19] "Depreciation.Costs....Gallon."
## [20] "Environmental.Compliance.Costs....Gallon."
## [21] "Transfer.Pricing.Formula"
## [22] "Market.Benchmark....Gallon."
## [23] "Profit.Margin...."
## [24] "Risk.Adjustments....Gallon."
## [25] "Net.Profit.Margin...."
## [26] "Return.on.Assets...."
## [27] "Competitor.Price....Gallon."
## [28] "Regional.Supply.Demand.Trends"
## [29] "Alternative.Energy.Impact"
## [30] "Excise.Tax....Gallon."
## [31] "Value.Added.Tax...."
## [32] "Carbon.Emission.Tax....Gallon."
## [33] "Market.Price.Volatility...."
## [34] "Exchange.Rate.Risk...."
## [35] "Geopolitical.Risk"
## [36] "Refinery.Utilization.Rate...."
## [37] "Processing.Capacity..Barrels.Day."
## [38] "Yield.per.Barrel..Gallons."
## [39] "Total.Transaction.Value...."
## [40] "Refinery.Gross.Margin....Gallon."
## [41] "Break.even.Price....Gallon."
## [42] "Product.Name"
# count the varibale has only one level
unique_counts <- sapply(price, function(x) n_distinct(x, na.rm = TRUE))
one_level_vars <- names(unique_counts[unique_counts == 1])
print(one_level_vars)
## [1] "Company.Name" "Business.Unit"
## [3] "Currency" "Region"
## [5] "Transfer.Pricing.Method" "Benchmarking"
## [7] "Transfer.Pricing.Formula" "Product.Name"
# look for missing data
sum(is.na(price))
## [1] 0
#look for duplicate data
sum(duplicated(price))
## [1] 0
str(price)
## tibble [366 × 42] (S3: tbl_df/tbl/data.frame)
## $ Company.Name : chr [1:366] "Global Energy Corp" "Global Energy Corp" "Global Energy Corp" "Global Energy Corp" ...
## $ Business.Unit : chr [1:366] "Refining & Trading" "Refining & Trading" "Refining & Trading" "Refining & Trading" ...
## $ Report.Period : chr [1:366] "2024-01-01" "2024-01-02" "2024-01-03" "2024-01-04" ...
## $ Currency : chr [1:366] "USD" "USD" "USD" "USD" ...
## $ Region : chr [1:366] "Usa" "Usa" "Usa" "Usa" ...
## $ Transfer.Pricing.Method : chr [1:366] "Cost-Plus Pricing" "Cost-Plus Pricing" "Cost-Plus Pricing" "Cost-Plus Pricing" ...
## $ Benchmarking : chr [1:366] "Market Price Comparison" "Market Price Comparison" "Market Price Comparison" "Market Price Comparison" ...
## $ Transaction.Volume..Gallons. : num [1:366] 438352 184378 141337 43593 225908 ...
## $ Unit.Price....Gallon. : num [1:366] 3.77 3.97 3.19 3.88 3.83 3.49 2.68 3.49 3.65 3.64 ...
## $ Market.Reference.Price....Gallon. : num [1:366] 2.98 3.73 2.51 3.86 3.67 2.69 2.84 3.27 2.92 3.97 ...
## $ Trade.Terms : chr [1:366] "CIF" "FOB" "CIF" "CIF" ...
## $ Crude.Oil.Purchase.Price....Barrel. : num [1:366] 78.8 67 88.1 70.9 64.7 ...
## $ Refining.Cost....Gallon. : num [1:366] 0.39 0.26 0.25 0.2 0.43 0.45 0.38 0.41 0.41 0.35 ...
## $ Transport.Cost....Gallon. : num [1:366] 0.06 0.13 0.11 0.1 0.11 0.15 0.13 0.1 0.12 0.14 ...
## $ Storage.Cost....Gallon. : num [1:366] 0.05 0.07 0.04 0.03 0.05 0.05 0.05 0.07 0.07 0.07 ...
## $ Pipeline.Tariffs....Gallon. : num [1:366] 0.05 0.01 0.01 0.04 0.03 0.04 0.02 0.04 0.01 0.03 ...
## $ Terminal.Fees....Gallon. : num [1:366] 0.06 0.06 0.04 0.04 0.02 0.06 0.03 0.03 0.05 0.03 ...
## $ Operating.Expenses....Gallon. : num [1:366] 0.24 0.19 0.26 0.23 0.25 0.24 0.17 0.12 0.26 0.27 ...
## $ Depreciation.Costs....Gallon. : num [1:366] 0.05 0.06 0.15 0.12 0.11 0.18 0.16 0.1 0.18 0.06 ...
## $ Environmental.Compliance.Costs....Gallon.: num [1:366] 0.03 0.03 0.02 0.04 0.05 0.02 0.05 0.03 0.02 0.01 ...
## $ Transfer.Pricing.Formula : chr [1:366] "Market Benchmark + Cost-Plus" "Market Benchmark + Cost-Plus" "Market Benchmark + Cost-Plus" "Market Benchmark + Cost-Plus" ...
## $ Market.Benchmark....Gallon. : num [1:366] 3.27 3.85 3.11 2.79 2.87 3.98 2.85 3.14 3.71 2.7 ...
## $ Profit.Margin.... : num [1:366] 11.94 12.13 6.39 13.06 13.81 ...
## $ Risk.Adjustments....Gallon. : num [1:366] 0.09 0.08 0.03 0.09 0.03 0.04 0.02 0.06 0.02 0.02 ...
## $ Net.Profit.Margin.... : num [1:366] 4.93 7.72 5.93 8.01 6.67 8.22 3.75 8.8 7.51 8.52 ...
## $ Return.on.Assets.... : num [1:366] 5.08 5.53 3.4 4.2 2.82 4.37 7.16 2.14 7.76 4.35 ...
## $ Competitor.Price....Gallon. : num [1:366] 3.18 3.27 4 3.47 3.25 3.05 3.58 3.21 2.97 3.85 ...
## $ Regional.Supply.Demand.Trends : chr [1:366] "Stable" "Stable" "Increasing Demand" "Stable" ...
## $ Alternative.Energy.Impact : chr [1:366] "High" "Moderate" "Moderate" "Moderate" ...
## $ Excise.Tax....Gallon. : num [1:366] 0.1 0.08 0.06 0.08 0.07 0.14 0.16 0.16 0.06 0.19 ...
## $ Value.Added.Tax.... : num [1:366] 17.6 11.42 9.4 10.62 7.86 ...
## $ Carbon.Emission.Tax....Gallon. : num [1:366] 0.01 0.03 0.09 0.05 0.08 0.05 0.01 0.03 0.1 0.04 ...
## $ Market.Price.Volatility.... : num [1:366] 2.9 4.12 4.09 7.86 2.74 7.2 5.49 1.61 7.4 2.01 ...
## $ Exchange.Rate.Risk.... : num [1:366] 3.4 1.46 2.09 2.72 4.2 1.21 1.82 4.99 2.6 2.65 ...
## $ Geopolitical.Risk : chr [1:366] "High" "High" "Low" "Medium" ...
## $ Refinery.Utilization.Rate.... : num [1:366] 80 91.3 91.3 86.3 90.8 ...
## $ Processing.Capacity..Barrels.Day. : num [1:366] 138129 355755 397510 174968 317043 ...
## $ Yield.per.Barrel..Gallons. : num [1:366] 33 31 35 34 34 34 30 32 36 32 ...
## $ Total.Transaction.Value.... : num [1:366] 1652587 731981 450865 169141 865228 ...
## $ Refinery.Gross.Margin....Gallon. : num [1:366] 2.59 3.47 2.26 3.66 3.24 2.24 2.46 2.86 2.51 3.62 ...
## $ Break.even.Price....Gallon. : num [1:366] 0.45 0.39 0.36 0.3 0.54 0.6 0.51 0.51 0.53 0.49 ...
## $ Product.Name : chr [1:366] "Gasoline" "Gasoline" "Gasoline" "Gasoline" ...
#chagne chr to factor for modeleling
price <- price %>% mutate_if(is.character, as.factor)
str(price)
## tibble [366 × 42] (S3: tbl_df/tbl/data.frame)
## $ Company.Name : Factor w/ 1 level "Global Energy Corp": 1 1 1 1 1 1 1 1 1 1 ...
## $ Business.Unit : Factor w/ 1 level "Refining & Trading": 1 1 1 1 1 1 1 1 1 1 ...
## $ Report.Period : Factor w/ 366 levels "2024-01-01","2024-01-02",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Currency : Factor w/ 1 level "USD": 1 1 1 1 1 1 1 1 1 1 ...
## $ Region : Factor w/ 1 level "Usa": 1 1 1 1 1 1 1 1 1 1 ...
## $ Transfer.Pricing.Method : Factor w/ 1 level "Cost-Plus Pricing": 1 1 1 1 1 1 1 1 1 1 ...
## $ Benchmarking : Factor w/ 1 level "Market Price Comparison": 1 1 1 1 1 1 1 1 1 1 ...
## $ Transaction.Volume..Gallons. : num [1:366] 438352 184378 141337 43593 225908 ...
## $ Unit.Price....Gallon. : num [1:366] 3.77 3.97 3.19 3.88 3.83 3.49 2.68 3.49 3.65 3.64 ...
## $ Market.Reference.Price....Gallon. : num [1:366] 2.98 3.73 2.51 3.86 3.67 2.69 2.84 3.27 2.92 3.97 ...
## $ Trade.Terms : Factor w/ 3 levels "CIF","DAP","FOB": 1 3 1 1 3 2 1 3 3 2 ...
## $ Crude.Oil.Purchase.Price....Barrel. : num [1:366] 78.8 67 88.1 70.9 64.7 ...
## $ Refining.Cost....Gallon. : num [1:366] 0.39 0.26 0.25 0.2 0.43 0.45 0.38 0.41 0.41 0.35 ...
## $ Transport.Cost....Gallon. : num [1:366] 0.06 0.13 0.11 0.1 0.11 0.15 0.13 0.1 0.12 0.14 ...
## $ Storage.Cost....Gallon. : num [1:366] 0.05 0.07 0.04 0.03 0.05 0.05 0.05 0.07 0.07 0.07 ...
## $ Pipeline.Tariffs....Gallon. : num [1:366] 0.05 0.01 0.01 0.04 0.03 0.04 0.02 0.04 0.01 0.03 ...
## $ Terminal.Fees....Gallon. : num [1:366] 0.06 0.06 0.04 0.04 0.02 0.06 0.03 0.03 0.05 0.03 ...
## $ Operating.Expenses....Gallon. : num [1:366] 0.24 0.19 0.26 0.23 0.25 0.24 0.17 0.12 0.26 0.27 ...
## $ Depreciation.Costs....Gallon. : num [1:366] 0.05 0.06 0.15 0.12 0.11 0.18 0.16 0.1 0.18 0.06 ...
## $ Environmental.Compliance.Costs....Gallon.: num [1:366] 0.03 0.03 0.02 0.04 0.05 0.02 0.05 0.03 0.02 0.01 ...
## $ Transfer.Pricing.Formula : Factor w/ 1 level "Market Benchmark + Cost-Plus": 1 1 1 1 1 1 1 1 1 1 ...
## $ Market.Benchmark....Gallon. : num [1:366] 3.27 3.85 3.11 2.79 2.87 3.98 2.85 3.14 3.71 2.7 ...
## $ Profit.Margin.... : num [1:366] 11.94 12.13 6.39 13.06 13.81 ...
## $ Risk.Adjustments....Gallon. : num [1:366] 0.09 0.08 0.03 0.09 0.03 0.04 0.02 0.06 0.02 0.02 ...
## $ Net.Profit.Margin.... : num [1:366] 4.93 7.72 5.93 8.01 6.67 8.22 3.75 8.8 7.51 8.52 ...
## $ Return.on.Assets.... : num [1:366] 5.08 5.53 3.4 4.2 2.82 4.37 7.16 2.14 7.76 4.35 ...
## $ Competitor.Price....Gallon. : num [1:366] 3.18 3.27 4 3.47 3.25 3.05 3.58 3.21 2.97 3.85 ...
## $ Regional.Supply.Demand.Trends : Factor w/ 3 levels "Decreasing Demand",..: 3 3 2 3 3 3 2 2 2 2 ...
## $ Alternative.Energy.Impact : Factor w/ 3 levels "High","Low","Moderate": 1 3 3 3 3 2 3 1 2 1 ...
## $ Excise.Tax....Gallon. : num [1:366] 0.1 0.08 0.06 0.08 0.07 0.14 0.16 0.16 0.06 0.19 ...
## $ Value.Added.Tax.... : num [1:366] 17.6 11.42 9.4 10.62 7.86 ...
## $ Carbon.Emission.Tax....Gallon. : num [1:366] 0.01 0.03 0.09 0.05 0.08 0.05 0.01 0.03 0.1 0.04 ...
## $ Market.Price.Volatility.... : num [1:366] 2.9 4.12 4.09 7.86 2.74 7.2 5.49 1.61 7.4 2.01 ...
## $ Exchange.Rate.Risk.... : num [1:366] 3.4 1.46 2.09 2.72 4.2 1.21 1.82 4.99 2.6 2.65 ...
## $ Geopolitical.Risk : Factor w/ 3 levels "High","Low","Medium": 1 1 2 3 2 2 1 1 1 2 ...
## $ Refinery.Utilization.Rate.... : num [1:366] 80 91.3 91.3 86.3 90.8 ...
## $ Processing.Capacity..Barrels.Day. : num [1:366] 138129 355755 397510 174968 317043 ...
## $ Yield.per.Barrel..Gallons. : num [1:366] 33 31 35 34 34 34 30 32 36 32 ...
## $ Total.Transaction.Value.... : num [1:366] 1652587 731981 450865 169141 865228 ...
## $ Refinery.Gross.Margin....Gallon. : num [1:366] 2.59 3.47 2.26 3.66 3.24 2.24 2.46 2.86 2.51 3.62 ...
## $ Break.even.Price....Gallon. : num [1:366] 0.45 0.39 0.36 0.3 0.54 0.6 0.51 0.51 0.53 0.49 ...
## $ Product.Name : Factor w/ 1 level "Gasoline": 1 1 1 1 1 1 1 1 1 1 ...
#data summary
summary(price)
## Company.Name Business.Unit Report.Period Currency
## Global Energy Corp:366 Refining & Trading:366 2024-01-01: 1 USD:366
## 2024-01-02: 1
## 2024-01-03: 1
## 2024-01-04: 1
## 2024-01-05: 1
## 2024-01-06: 1
## (Other) :360
## Region Transfer.Pricing.Method Benchmarking
## Usa:366 Cost-Plus Pricing:366 Market Price Comparison:366
##
##
##
##
##
##
## Transaction.Volume..Gallons. Unit.Price....Gallon.
## Min. : 11043 Min. :2.500
## 1st Qu.:132563 1st Qu.:2.890
## Median :238005 Median :3.285
## Mean :246641 Mean :3.257
## 3rd Qu.:366264 3rd Qu.:3.600
## Max. :497593 Max. :4.000
##
## Market.Reference.Price....Gallon. Trade.Terms
## Min. :2.510 CIF:131
## 1st Qu.:2.973 DAP:103
## Median :3.395 FOB:132
## Mean :3.369
## 3rd Qu.:3.780
## Max. :4.200
##
## Crude.Oil.Purchase.Price....Barrel. Refining.Cost....Gallon.
## Min. :60.12 Min. :0.2000
## 1st Qu.:66.25 1st Qu.:0.2700
## Median :74.02 Median :0.3400
## Mean :74.39 Mean :0.3434
## 3rd Qu.:82.21 3rd Qu.:0.4200
## Max. :89.88 Max. :0.5000
##
## Transport.Cost....Gallon. Storage.Cost....Gallon. Pipeline.Tariffs....Gallon.
## Min. :0.0500 Min. :0.02000 Min. :0.01000
## 1st Qu.:0.0800 1st Qu.:0.04000 1st Qu.:0.02000
## Median :0.1000 Median :0.05000 Median :0.03000
## Mean :0.1014 Mean :0.04959 Mean :0.03011
## 3rd Qu.:0.1300 3rd Qu.:0.06000 3rd Qu.:0.04000
## Max. :0.1500 Max. :0.08000 Max. :0.05000
##
## Terminal.Fees....Gallon. Operating.Expenses....Gallon.
## Min. :0.02000 Min. :0.1000
## 1st Qu.:0.03000 1st Qu.:0.1500
## Median :0.04000 Median :0.2100
## Mean :0.03975 Mean :0.2015
## 3rd Qu.:0.05000 3rd Qu.:0.2500
## Max. :0.06000 Max. :0.3000
##
## Depreciation.Costs....Gallon. Environmental.Compliance.Costs....Gallon.
## Min. :0.0500 Min. :0.01000
## 1st Qu.:0.0800 1st Qu.:0.02000
## Median :0.1200 Median :0.03000
## Mean :0.1240 Mean :0.03005
## 3rd Qu.:0.1675 3rd Qu.:0.04000
## Max. :0.2000 Max. :0.05000
##
## Transfer.Pricing.Formula Market.Benchmark....Gallon.
## Market Benchmark + Cost-Plus:366 Min. :2.600
## 1st Qu.:2.940
## Median :3.285
## Mean :3.284
## 3rd Qu.:3.660
## Max. :4.000
##
## Profit.Margin.... Risk.Adjustments....Gallon. Net.Profit.Margin....
## Min. : 5.040 Min. :0.01000 Min. :3.020
## 1st Qu.: 7.293 1st Qu.:0.03000 1st Qu.:4.825
## Median : 9.575 Median :0.05000 Median :6.590
## Mean : 9.783 Mean :0.05413 Mean :6.489
## 3rd Qu.:11.985 3rd Qu.:0.08000 3rd Qu.:8.137
## Max. :14.950 Max. :0.10000 Max. :9.960
##
## Return.on.Assets.... Competitor.Price....Gallon.
## Min. :2.010 Min. :2.510
## 1st Qu.:3.442 1st Qu.:2.960
## Median :4.940 Median :3.340
## Mean :4.949 Mean :3.333
## 3rd Qu.:6.410 3rd Qu.:3.720
## Max. :7.990 Max. :4.090
##
## Regional.Supply.Demand.Trends Alternative.Energy.Impact
## Decreasing Demand:118 High :117
## Increasing Demand:124 Low :123
## Stable :124 Moderate:126
##
##
##
##
## Excise.Tax....Gallon. Value.Added.Tax.... Carbon.Emission.Tax....Gallon.
## Min. :0.050 Min. : 5.070 Min. :0.01000
## 1st Qu.:0.090 1st Qu.: 9.367 1st Qu.:0.03000
## Median :0.120 Median :13.160 Median :0.05000
## Mean :0.123 Mean :12.759 Mean :0.05434
## 3rd Qu.:0.160 3rd Qu.:16.137 3rd Qu.:0.08000
## Max. :0.200 Max. :19.900 Max. :0.10000
##
## Market.Price.Volatility.... Exchange.Rate.Risk.... Geopolitical.Risk
## Min. :1.040 Min. :0.510 High :123
## 1st Qu.:3.103 1st Qu.:1.512 Low :113
## Median :5.110 Median :2.660 Medium:130
## Mean :5.416 Mean :2.648
## 3rd Qu.:7.577 3rd Qu.:3.690
## Max. :9.980 Max. :5.000
##
## Refinery.Utilization.Rate.... Processing.Capacity..Barrels.Day.
## Min. :75.03 Min. :100399
## 1st Qu.:80.32 1st Qu.:197118
## Median :85.64 Median :297231
## Mean :85.37 Mean :295243
## 3rd Qu.:90.46 3rd Qu.:392937
## Max. :94.98 Max. :499672
##
## Yield.per.Barrel..Gallons. Total.Transaction.Value....
## Min. :30.0 Min. : 31141
## 1st Qu.:33.0 1st Qu.: 421065
## Median :36.0 Median : 764578
## Mean :35.5 Mean : 805110
## 3rd Qu.:39.0 3rd Qu.:1202052
## Max. :41.0 Max. :1946554
##
## Refinery.Gross.Margin....Gallon. Break.even.Price....Gallon. Product.Name
## Min. :2.080 Min. :0.2500 Gasoline:366
## 1st Qu.:2.603 1st Qu.:0.3700
## Median :3.065 Median :0.4400
## Mean :3.025 Mean :0.4449
## 3rd Qu.:3.428 3rd Qu.:0.5200
## Max. :3.970 Max. :0.6500
##
# look for distinct of each variable
numeric_vars <- price[, sapply(price, is.numeric), drop = FALSE]
num_unique_values <- sapply(numeric_vars, n_distinct)
print(num_unique_values)
## Transaction.Volume..Gallons.
## 365
## Unit.Price....Gallon.
## 132
## Market.Reference.Price....Gallon.
## 153
## Crude.Oil.Purchase.Price....Barrel.
## 348
## Refining.Cost....Gallon.
## 31
## Transport.Cost....Gallon.
## 11
## Storage.Cost....Gallon.
## 7
## Pipeline.Tariffs....Gallon.
## 5
## Terminal.Fees....Gallon.
## 5
## Operating.Expenses....Gallon.
## 21
## Depreciation.Costs....Gallon.
## 16
## Environmental.Compliance.Costs....Gallon.
## 5
## Market.Benchmark....Gallon.
## 133
## Profit.Margin....
## 303
## Risk.Adjustments....Gallon.
## 10
## Net.Profit.Margin....
## 285
## Return.on.Assets....
## 265
## Competitor.Price....Gallon.
## 143
## Excise.Tax....Gallon.
## 16
## Value.Added.Tax....
## 325
## Carbon.Emission.Tax....Gallon.
## 10
## Market.Price.Volatility....
## 297
## Exchange.Rate.Risk....
## 253
## Refinery.Utilization.Rate....
## 333
## Processing.Capacity..Barrels.Day.
## 366
## Yield.per.Barrel..Gallons.
## 12
## Total.Transaction.Value....
## 366
## Refinery.Gross.Margin....Gallon.
## 154
## Break.even.Price....Gallon.
## 48
# Convert columns with <10 unique values to factors
price <- as.data.frame(lapply(price, function(x) if (length(unique(x)) < 10) as.factor(x) else x))
# Check structure
str(price)
## 'data.frame': 366 obs. of 42 variables:
## $ Company.Name : Factor w/ 1 level "Global Energy Corp": 1 1 1 1 1 1 1 1 1 1 ...
## $ Business.Unit : Factor w/ 1 level "Refining & Trading": 1 1 1 1 1 1 1 1 1 1 ...
## $ Report.Period : Factor w/ 366 levels "2024-01-01","2024-01-02",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ Currency : Factor w/ 1 level "USD": 1 1 1 1 1 1 1 1 1 1 ...
## $ Region : Factor w/ 1 level "Usa": 1 1 1 1 1 1 1 1 1 1 ...
## $ Transfer.Pricing.Method : Factor w/ 1 level "Cost-Plus Pricing": 1 1 1 1 1 1 1 1 1 1 ...
## $ Benchmarking : Factor w/ 1 level "Market Price Comparison": 1 1 1 1 1 1 1 1 1 1 ...
## $ Transaction.Volume..Gallons. : num 438352 184378 141337 43593 225908 ...
## $ Unit.Price....Gallon. : num 3.77 3.97 3.19 3.88 3.83 3.49 2.68 3.49 3.65 3.64 ...
## $ Market.Reference.Price....Gallon. : num 2.98 3.73 2.51 3.86 3.67 2.69 2.84 3.27 2.92 3.97 ...
## $ Trade.Terms : Factor w/ 3 levels "CIF","DAP","FOB": 1 3 1 1 3 2 1 3 3 2 ...
## $ Crude.Oil.Purchase.Price....Barrel. : num 78.8 67 88.1 70.9 64.7 ...
## $ Refining.Cost....Gallon. : num 0.39 0.26 0.25 0.2 0.43 0.45 0.38 0.41 0.41 0.35 ...
## $ Transport.Cost....Gallon. : num 0.06 0.13 0.11 0.1 0.11 0.15 0.13 0.1 0.12 0.14 ...
## $ Storage.Cost....Gallon. : Factor w/ 7 levels "0.02","0.03",..: 4 6 3 2 4 4 4 6 6 6 ...
## $ Pipeline.Tariffs....Gallon. : Factor w/ 5 levels "0.01","0.02",..: 5 1 1 4 3 4 2 4 1 3 ...
## $ Terminal.Fees....Gallon. : Factor w/ 5 levels "0.02","0.03",..: 5 5 3 3 1 5 2 2 4 2 ...
## $ Operating.Expenses....Gallon. : num 0.24 0.19 0.26 0.23 0.25 0.24 0.17 0.12 0.26 0.27 ...
## $ Depreciation.Costs....Gallon. : num 0.05 0.06 0.15 0.12 0.11 0.18 0.16 0.1 0.18 0.06 ...
## $ Environmental.Compliance.Costs....Gallon.: Factor w/ 5 levels "0.01","0.02",..: 3 3 2 4 5 2 5 3 2 1 ...
## $ Transfer.Pricing.Formula : Factor w/ 1 level "Market Benchmark + Cost-Plus": 1 1 1 1 1 1 1 1 1 1 ...
## $ Market.Benchmark....Gallon. : num 3.27 3.85 3.11 2.79 2.87 3.98 2.85 3.14 3.71 2.7 ...
## $ Profit.Margin.... : num 11.94 12.13 6.39 13.06 13.81 ...
## $ Risk.Adjustments....Gallon. : num 0.09 0.08 0.03 0.09 0.03 0.04 0.02 0.06 0.02 0.02 ...
## $ Net.Profit.Margin.... : num 4.93 7.72 5.93 8.01 6.67 8.22 3.75 8.8 7.51 8.52 ...
## $ Return.on.Assets.... : num 5.08 5.53 3.4 4.2 2.82 4.37 7.16 2.14 7.76 4.35 ...
## $ Competitor.Price....Gallon. : num 3.18 3.27 4 3.47 3.25 3.05 3.58 3.21 2.97 3.85 ...
## $ Regional.Supply.Demand.Trends : Factor w/ 3 levels "Decreasing Demand",..: 3 3 2 3 3 3 2 2 2 2 ...
## $ Alternative.Energy.Impact : Factor w/ 3 levels "High","Low","Moderate": 1 3 3 3 3 2 3 1 2 1 ...
## $ Excise.Tax....Gallon. : num 0.1 0.08 0.06 0.08 0.07 0.14 0.16 0.16 0.06 0.19 ...
## $ Value.Added.Tax.... : num 17.6 11.42 9.4 10.62 7.86 ...
## $ Carbon.Emission.Tax....Gallon. : num 0.01 0.03 0.09 0.05 0.08 0.05 0.01 0.03 0.1 0.04 ...
## $ Market.Price.Volatility.... : num 2.9 4.12 4.09 7.86 2.74 7.2 5.49 1.61 7.4 2.01 ...
## $ Exchange.Rate.Risk.... : num 3.4 1.46 2.09 2.72 4.2 1.21 1.82 4.99 2.6 2.65 ...
## $ Geopolitical.Risk : Factor w/ 3 levels "High","Low","Medium": 1 1 2 3 2 2 1 1 1 2 ...
## $ Refinery.Utilization.Rate.... : num 80 91.3 91.3 86.3 90.8 ...
## $ Processing.Capacity..Barrels.Day. : num 138129 355755 397510 174968 317043 ...
## $ Yield.per.Barrel..Gallons. : num 33 31 35 34 34 34 30 32 36 32 ...
## $ Total.Transaction.Value.... : num 1652587 731981 450865 169141 865228 ...
## $ Refinery.Gross.Margin....Gallon. : num 2.59 3.47 2.26 3.66 3.24 2.24 2.46 2.86 2.51 3.62 ...
## $ Break.even.Price....Gallon. : num 0.45 0.39 0.36 0.3 0.54 0.6 0.51 0.51 0.53 0.49 ...
## $ Product.Name : Factor w/ 1 level "Gasoline": 1 1 1 1 1 1 1 1 1 1 ...
numeric <- price[, sapply(price, is.numeric), drop = FALSE]
factor <- price[, sapply(price, is.character), drop = FALSE]
numeric <- numeric %>%
rename_with(~ str_wrap(.x, width = 20))
numeric %>%
gather(key = "Variable", value = "Value") %>%
ggplot(aes(x = Value)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
geom_density(color = "blue", size = 0.5) +
facet_wrap(~Variable, scales = "free") +
theme_minimal() +
theme(strip.text = element_text(size = 6)) +
ggtitle("Histogram and Density Plot of Numeric Variables")

# format the plot
numeric <- numeric %>%
rename_with(~ stringr::str_wrap(.x, width = 20))
numeric <- numeric %>%
gather(key = "Variable", value = "Value")
# Q-Q Plot
ggplot(numeric, aes(sample = Value)) +
stat_qq() +
stat_qq_line(color = "red") +
facet_wrap(~Variable, scales = "free") +
theme_minimal() +
theme(strip.text = element_text(size = 6)) +
ggtitle("Q-Q Plot of Numeric Variables")

# look for num outliers
numeric_vars <- price %>% dplyr::select(where(is.numeric))
outliers <- lapply(numeric_vars, function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
which(x < lower_bound | x > upper_bound)
})
outliers
## $Transaction.Volume..Gallons.
## integer(0)
##
## $Unit.Price....Gallon.
## integer(0)
##
## $Market.Reference.Price....Gallon.
## integer(0)
##
## $Crude.Oil.Purchase.Price....Barrel.
## integer(0)
##
## $Refining.Cost....Gallon.
## integer(0)
##
## $Transport.Cost....Gallon.
## integer(0)
##
## $Operating.Expenses....Gallon.
## integer(0)
##
## $Depreciation.Costs....Gallon.
## integer(0)
##
## $Market.Benchmark....Gallon.
## integer(0)
##
## $Profit.Margin....
## integer(0)
##
## $Risk.Adjustments....Gallon.
## integer(0)
##
## $Net.Profit.Margin....
## integer(0)
##
## $Return.on.Assets....
## integer(0)
##
## $Competitor.Price....Gallon.
## integer(0)
##
## $Excise.Tax....Gallon.
## integer(0)
##
## $Value.Added.Tax....
## integer(0)
##
## $Carbon.Emission.Tax....Gallon.
## integer(0)
##
## $Market.Price.Volatility....
## integer(0)
##
## $Exchange.Rate.Risk....
## integer(0)
##
## $Refinery.Utilization.Rate....
## integer(0)
##
## $Processing.Capacity..Barrels.Day.
## integer(0)
##
## $Yield.per.Barrel..Gallons.
## integer(0)
##
## $Total.Transaction.Value....
## integer(0)
##
## $Refinery.Gross.Margin....Gallon.
## integer(0)
##
## $Break.even.Price....Gallon.
## integer(0)
numeric <- price[, sapply(price, is.numeric), drop = FALSE]
# look for the for the correlation between num and y
cor_results <- cor(numeric, use = "complete.obs")
cor_results["Unit.Price....Gallon.", ]
## Transaction.Volume..Gallons. Unit.Price....Gallon.
## 0.031694613 1.000000000
## Market.Reference.Price....Gallon. Crude.Oil.Purchase.Price....Barrel.
## 0.127159227 0.019416446
## Refining.Cost....Gallon. Transport.Cost....Gallon.
## -0.059677350 0.032534250
## Operating.Expenses....Gallon. Depreciation.Costs....Gallon.
## 0.047817560 -0.050515793
## Market.Benchmark....Gallon. Profit.Margin....
## -0.046882739 -0.076025599
## Risk.Adjustments....Gallon. Net.Profit.Margin....
## -0.054379813 -0.078125814
## Return.on.Assets.... Competitor.Price....Gallon.
## 0.008796997 0.083188138
## Excise.Tax....Gallon. Value.Added.Tax....
## -0.037687003 0.045761039
## Carbon.Emission.Tax....Gallon. Market.Price.Volatility....
## -0.004259401 -0.020275851
## Exchange.Rate.Risk.... Refinery.Utilization.Rate....
## 0.103598985 0.048157393
## Processing.Capacity..Barrels.Day. Yield.per.Barrel..Gallons.
## -0.018505055 0.107334769
## Total.Transaction.Value.... Refinery.Gross.Margin....Gallon.
## 0.253527157 0.136125104
## Break.even.Price....Gallon.
## -0.045477075
#cor > 0.5 →positive relation, price go up when ind bigger
#cor < -0.5 → neg relation, price go down when ind bigger
#cor ≈ 0 → no correlation
#remove one level factor
factor <- price[, sapply(price, is.character), drop = FALSE]
unique_counts <- sapply(factor, n_distinct)
one_level_factors <- names(unique_counts[unique_counts == 1])
print(one_level_factors)
## character(0)
dim(price)
## [1] 366 42
#correlation look for correlation between num
numeric <- price[, sapply(price, is.numeric), drop = FALSE]
cor_matrix <- cor(numeric_vars, use = "complete.obs")
# (|cor| > 0.7)
high_cor_pairs <- which(abs(cor_matrix) > 0.7 & abs(cor_matrix) < 1, arr.ind = TRUE)
high_cor_df <- data.frame(
Variable_1 = rownames(cor_matrix)[high_cor_pairs[, 1]],
Variable_2 = colnames(cor_matrix)[high_cor_pairs[, 2]],
Correlation = cor_matrix[high_cor_pairs]
)
high_cor_df <- high_cor_df %>%
distinct(Variable_1, Variable_2, .keep_all = TRUE)
print(high_cor_df)
## Variable_1 Variable_2
## 1 Total.Transaction.Value.... Transaction.Volume..Gallons.
## 2 Refinery.Gross.Margin....Gallon. Market.Reference.Price....Gallon.
## 3 Break.even.Price....Gallon. Refining.Cost....Gallon.
## 4 Transaction.Volume..Gallons. Total.Transaction.Value....
## 5 Market.Reference.Price....Gallon. Refinery.Gross.Margin....Gallon.
## 6 Refining.Cost....Gallon. Break.even.Price....Gallon.
## Correlation
## 1 0.9667413
## 2 0.9841007
## 3 0.9452115
## 4 0.9667413
## 5 0.9841007
## 6 0.9452115
#delete the high correlated
price <- price[, setdiff(names(price), c("Total.Transaction.Value....",
"Refinery.Gross.Margin....Gallon.",
"Break.even.Price....Gallon.")), drop = FALSE]
names(price)
## [1] "Company.Name"
## [2] "Business.Unit"
## [3] "Report.Period"
## [4] "Currency"
## [5] "Region"
## [6] "Transfer.Pricing.Method"
## [7] "Benchmarking"
## [8] "Transaction.Volume..Gallons."
## [9] "Unit.Price....Gallon."
## [10] "Market.Reference.Price....Gallon."
## [11] "Trade.Terms"
## [12] "Crude.Oil.Purchase.Price....Barrel."
## [13] "Refining.Cost....Gallon."
## [14] "Transport.Cost....Gallon."
## [15] "Storage.Cost....Gallon."
## [16] "Pipeline.Tariffs....Gallon."
## [17] "Terminal.Fees....Gallon."
## [18] "Operating.Expenses....Gallon."
## [19] "Depreciation.Costs....Gallon."
## [20] "Environmental.Compliance.Costs....Gallon."
## [21] "Transfer.Pricing.Formula"
## [22] "Market.Benchmark....Gallon."
## [23] "Profit.Margin...."
## [24] "Risk.Adjustments....Gallon."
## [25] "Net.Profit.Margin...."
## [26] "Return.on.Assets...."
## [27] "Competitor.Price....Gallon."
## [28] "Regional.Supply.Demand.Trends"
## [29] "Alternative.Energy.Impact"
## [30] "Excise.Tax....Gallon."
## [31] "Value.Added.Tax...."
## [32] "Carbon.Emission.Tax....Gallon."
## [33] "Market.Price.Volatility...."
## [34] "Exchange.Rate.Risk...."
## [35] "Geopolitical.Risk"
## [36] "Refinery.Utilization.Rate...."
## [37] "Processing.Capacity..Barrels.Day."
## [38] "Yield.per.Barrel..Gallons."
## [39] "Product.Name"
#build linear regression model
price <- as.data.frame(price) %>%
mutate(across(where(is.character), as.factor))
set.seed(5678)
train_index <- sample(1:nrow(price), 0.8 * nrow(price))
train_data <- price[train_index, ]
test_data <- price[-train_index, ]
factor_cols <- names(train_data)[sapply(train_data, is.factor)]
single_level_factors <- factor_cols[sapply(train_data[factor_cols], function(x) length(unique(x)) == 1)]
if (length(single_level_factors) > 0) {
train_data <- train_data[, !(names(train_data) %in% single_level_factors)]
}
# remove Report.Period
train_data <- train_data %>% dplyr::select(-Report.Period)
test_data <- test_data %>% dplyr::select(-Report.Period)
lm_model <- lm(`Unit.Price....Gallon.` ~ ., data = train_data)
train_preds <- predict(lm_model, newdata = train_data)
test_preds <- predict(lm_model, newdata = test_data)
# MSE
mse_train <- mean((train_data$`Unit.Price....Gallon.` - train_preds)^2)
mse_test <- mean((test_data$`Unit.Price....Gallon.` - test_preds)^2)
# RMSE
rmse_train <- sqrt(mse_train)
rmse_test <- sqrt(mse_test)
# R²
r_squared_train <- summary(lm_model)$r.squared
r_squared_test <- 1 - sum((test_preds - test_data$`Unit.Price....Gallon.`)^2) /
sum((test_data$`Unit.Price....Gallon.` - mean(test_data$`Unit.Price....Gallon.`))^2)
cat("📌 **模型评估结果**\n")
## 📌 **模型评估结果**
cat("✅ MSE:", mse_train, "\n")
## ✅ MSE: 0.1510838
cat("✅ MSE:", mse_test, "\n")
## ✅ MSE: 0.1711638
cat("✅ RMSE:", rmse_train, "\n")
## ✅ RMSE: 0.388695
cat("✅ RMSE:", rmse_test, "\n")
## ✅ RMSE: 0.4137195
cat("✅ R²:", r_squared_train, "\n")
## ✅ R²: 0.1716628
cat("✅ R²:", r_squared_test, "\n")
## ✅ R²: -0.008538477
# second model random forest
file_path <- "C:/Users/yuan1/Downloads/transfer_pricing_daily_data_v2.xlsx"
price<- readxl::read_excel(file_path)
colnames(price) <- make.names(colnames(price))
print(colnames(price))
## [1] "Company.Name"
## [2] "Business.Unit"
## [3] "Report.Period"
## [4] "Currency"
## [5] "Region"
## [6] "Transfer.Pricing.Method"
## [7] "Benchmarking"
## [8] "Transaction.Volume..Gallons."
## [9] "Unit.Price....Gallon."
## [10] "Market.Reference.Price....Gallon."
## [11] "Trade.Terms"
## [12] "Crude.Oil.Purchase.Price....Barrel."
## [13] "Refining.Cost....Gallon."
## [14] "Transport.Cost....Gallon."
## [15] "Storage.Cost....Gallon."
## [16] "Pipeline.Tariffs....Gallon."
## [17] "Terminal.Fees....Gallon."
## [18] "Operating.Expenses....Gallon."
## [19] "Depreciation.Costs....Gallon."
## [20] "Environmental.Compliance.Costs....Gallon."
## [21] "Transfer.Pricing.Formula"
## [22] "Market.Benchmark....Gallon."
## [23] "Profit.Margin...."
## [24] "Risk.Adjustments....Gallon."
## [25] "Net.Profit.Margin...."
## [26] "Return.on.Assets...."
## [27] "Competitor.Price....Gallon."
## [28] "Regional.Supply.Demand.Trends"
## [29] "Alternative.Energy.Impact"
## [30] "Excise.Tax....Gallon."
## [31] "Value.Added.Tax...."
## [32] "Carbon.Emission.Tax....Gallon."
## [33] "Market.Price.Volatility...."
## [34] "Exchange.Rate.Risk...."
## [35] "Geopolitical.Risk"
## [36] "Refinery.Utilization.Rate...."
## [37] "Processing.Capacity..Barrels.Day."
## [38] "Yield.per.Barrel..Gallons."
## [39] "Total.Transaction.Value...."
## [40] "Refinery.Gross.Margin....Gallon."
## [41] "Break.even.Price....Gallon."
## [42] "Product.Name"
numeric <- price[, sapply(price, is.numeric), drop = FALSE]
factor <- price[, sapply(price, is.character), drop = FALSE]
set.seed(456)
trainIndex <- sample(1:nrow(price), 0.8 * nrow(price))
train_data <- price[trainIndex, ]
test_data <- price[-trainIndex, ]
train_data <- train_data[, !(sapply(train_data, function(x) is.factor(x) && length(unique(x)) > 53))]
model_rf <- randomForest(Unit.Price....Gallon. ~ ., data = train_data, ntree = 500, importance = TRUE)
predictions_rf <- predict(model_rf, test_data)
# R² RMSE
r2_rf <- cor(test_data$Unit.Price....Gallon., predictions_rf)^2
rmse_rf <- sqrt(mean((test_data$Unit.Price....Gallon. - predictions_rf)^2))
#
cat("📌 Random Forest 结果:\n")
## 📌 Random Forest 结果:
cat("✅ R²:", r2_rf, "\n")
## ✅ R²: 0.1568565
cat("✅ RMSE:", rmse_rf, "\n")
## ✅ RMSE: 0.409466
importance(model_rf)
## %IncMSE IncNodePurity
## Company.Name 0.00000000 0.0000000
## Business.Unit 0.00000000 0.0000000
## Report.Period -1.92583655 1.6338530
## Currency 0.00000000 0.0000000
## Region 0.00000000 0.0000000
## Transfer.Pricing.Method 0.00000000 0.0000000
## Benchmarking 0.00000000 0.0000000
## Transaction.Volume..Gallons. 13.59693177 3.7134133
## Market.Reference.Price....Gallon. 3.61228469 1.8830174
## Trade.Terms 0.27723692 0.3111749
## Crude.Oil.Purchase.Price....Barrel. 0.79436861 2.1062521
## Refining.Cost....Gallon. 0.61891180 1.1979896
## Transport.Cost....Gallon. 0.35355994 1.1265967
## Storage.Cost....Gallon. -1.82136601 0.6494871
## Pipeline.Tariffs....Gallon. 0.06568102 0.5803790
## Terminal.Fees....Gallon. 1.84934412 0.7097186
## Operating.Expenses....Gallon. 1.97593228 1.1786858
## Depreciation.Costs....Gallon. 1.75166526 1.2959349
## Environmental.Compliance.Costs....Gallon. -1.29428093 0.4902236
## Transfer.Pricing.Formula 0.00000000 0.0000000
## Market.Benchmark....Gallon. -0.96554040 1.5845444
## Profit.Margin.... -0.30867093 1.7417089
## Risk.Adjustments....Gallon. 0.23128235 0.8994355
## Net.Profit.Margin.... 1.81687847 2.2295281
## Return.on.Assets.... 0.97520885 1.4943019
## Competitor.Price....Gallon. 1.40757782 1.5575766
## Regional.Supply.Demand.Trends 1.96960141 0.4883561
## Alternative.Energy.Impact -1.33935989 0.3630602
## Excise.Tax....Gallon. 0.51935085 1.3067210
## Value.Added.Tax.... -1.39098930 1.8127782
## Carbon.Emission.Tax....Gallon. -0.77386478 0.9607585
## Market.Price.Volatility.... -2.74123895 1.4416808
## Exchange.Rate.Risk.... 1.25315920 1.6983066
## Geopolitical.Risk -0.64811050 0.3381729
## Refinery.Utilization.Rate.... 0.60920090 1.7349602
## Processing.Capacity..Barrels.Day. 0.58056990 1.9898112
## Yield.per.Barrel..Gallons. 0.32233402 1.2043611
## Total.Transaction.Value.... 23.87195099 6.8260299
## Refinery.Gross.Margin....Gallon. 5.27824505 2.0006795
## Break.even.Price....Gallon. 1.78227975 1.3484866
## Product.Name 0.00000000 0.0000000
varImpPlot(model_rf)

importance(model_rf)
## %IncMSE IncNodePurity
## Company.Name 0.00000000 0.0000000
## Business.Unit 0.00000000 0.0000000
## Report.Period -1.92583655 1.6338530
## Currency 0.00000000 0.0000000
## Region 0.00000000 0.0000000
## Transfer.Pricing.Method 0.00000000 0.0000000
## Benchmarking 0.00000000 0.0000000
## Transaction.Volume..Gallons. 13.59693177 3.7134133
## Market.Reference.Price....Gallon. 3.61228469 1.8830174
## Trade.Terms 0.27723692 0.3111749
## Crude.Oil.Purchase.Price....Barrel. 0.79436861 2.1062521
## Refining.Cost....Gallon. 0.61891180 1.1979896
## Transport.Cost....Gallon. 0.35355994 1.1265967
## Storage.Cost....Gallon. -1.82136601 0.6494871
## Pipeline.Tariffs....Gallon. 0.06568102 0.5803790
## Terminal.Fees....Gallon. 1.84934412 0.7097186
## Operating.Expenses....Gallon. 1.97593228 1.1786858
## Depreciation.Costs....Gallon. 1.75166526 1.2959349
## Environmental.Compliance.Costs....Gallon. -1.29428093 0.4902236
## Transfer.Pricing.Formula 0.00000000 0.0000000
## Market.Benchmark....Gallon. -0.96554040 1.5845444
## Profit.Margin.... -0.30867093 1.7417089
## Risk.Adjustments....Gallon. 0.23128235 0.8994355
## Net.Profit.Margin.... 1.81687847 2.2295281
## Return.on.Assets.... 0.97520885 1.4943019
## Competitor.Price....Gallon. 1.40757782 1.5575766
## Regional.Supply.Demand.Trends 1.96960141 0.4883561
## Alternative.Energy.Impact -1.33935989 0.3630602
## Excise.Tax....Gallon. 0.51935085 1.3067210
## Value.Added.Tax.... -1.39098930 1.8127782
## Carbon.Emission.Tax....Gallon. -0.77386478 0.9607585
## Market.Price.Volatility.... -2.74123895 1.4416808
## Exchange.Rate.Risk.... 1.25315920 1.6983066
## Geopolitical.Risk -0.64811050 0.3381729
## Refinery.Utilization.Rate.... 0.60920090 1.7349602
## Processing.Capacity..Barrels.Day. 0.58056990 1.9898112
## Yield.per.Barrel..Gallons. 0.32233402 1.2043611
## Total.Transaction.Value.... 23.87195099 6.8260299
## Refinery.Gross.Margin....Gallon. 5.27824505 2.0006795
## Break.even.Price....Gallon. 1.78227975 1.3484866
## Product.Name 0.00000000 0.0000000
#Model 3 XGBoost
file_path <- "C:/Users/yuan1/Downloads/transfer_pricing_daily_data_v2.xlsx"
price<- readxl::read_excel(file_path)
colnames(price) <- make.names(colnames(price))
print(colnames(price))
## [1] "Company.Name"
## [2] "Business.Unit"
## [3] "Report.Period"
## [4] "Currency"
## [5] "Region"
## [6] "Transfer.Pricing.Method"
## [7] "Benchmarking"
## [8] "Transaction.Volume..Gallons."
## [9] "Unit.Price....Gallon."
## [10] "Market.Reference.Price....Gallon."
## [11] "Trade.Terms"
## [12] "Crude.Oil.Purchase.Price....Barrel."
## [13] "Refining.Cost....Gallon."
## [14] "Transport.Cost....Gallon."
## [15] "Storage.Cost....Gallon."
## [16] "Pipeline.Tariffs....Gallon."
## [17] "Terminal.Fees....Gallon."
## [18] "Operating.Expenses....Gallon."
## [19] "Depreciation.Costs....Gallon."
## [20] "Environmental.Compliance.Costs....Gallon."
## [21] "Transfer.Pricing.Formula"
## [22] "Market.Benchmark....Gallon."
## [23] "Profit.Margin...."
## [24] "Risk.Adjustments....Gallon."
## [25] "Net.Profit.Margin...."
## [26] "Return.on.Assets...."
## [27] "Competitor.Price....Gallon."
## [28] "Regional.Supply.Demand.Trends"
## [29] "Alternative.Energy.Impact"
## [30] "Excise.Tax....Gallon."
## [31] "Value.Added.Tax...."
## [32] "Carbon.Emission.Tax....Gallon."
## [33] "Market.Price.Volatility...."
## [34] "Exchange.Rate.Risk...."
## [35] "Geopolitical.Risk"
## [36] "Refinery.Utilization.Rate...."
## [37] "Processing.Capacity..Barrels.Day."
## [38] "Yield.per.Barrel..Gallons."
## [39] "Total.Transaction.Value...."
## [40] "Refinery.Gross.Margin....Gallon."
## [41] "Break.even.Price....Gallon."
## [42] "Product.Name"
numeric <- price[, sapply(price, is.numeric), drop = FALSE]
factor <- price[, sapply(price, is.character), drop = FALSE]
set.seed(456)
train_data <- price
train_data[] <- lapply(train_data, function(x) {
if (is.factor(x) | is.character(x)) {
return(as.numeric(as.factor(x)))
} else {
return(x)
}
})
# train_data[, -1] <- scale(train_data[, -1])
# emove the varibale more than 53
train_data <- train_data[, !(sapply(train_data, function(x) is.factor(x) && length(unique(x)) > 53))]
factor_cols <- sapply(train_data, is.factor)
train_data[factor_cols] <- lapply(train_data[factor_cols], function(x) as.numeric(as.character(x)))
train_data[is.na(train_data)] <- median(train_data$Unit.Price....Gallon., na.rm = TRUE)
# (Year, Month, Day)
train_data$Year <- as.numeric(substr(train_data$Report.Period, 1, 4))
train_data$Month <- as.numeric(substr(train_data$Report.Period, 6, 7))
train_data$Day <- as.numeric(substr(train_data$Report.Period, 9, 10))
train_data$Report.Period <- NULL # 删除原始日期列
# train_data[, -1] <- scale(train_data[, -1])
trainIndex <- sample(1:nrow(train_data), 0.8 * nrow(train_data))
train_set <- train_data[trainIndex, ]
test_set <- train_data[-trainIndex, ]
train_set[] <- lapply(train_set, function(x) if (is.character(x)) as.numeric(as.factor(x)) else x)
test_set[] <- lapply(test_set, function(x) if (is.character(x)) as.numeric(as.factor(x)) else x)
train_matrix <- as.matrix(train_set[, !colnames(train_set) %in% "Unit.Price....Gallon."])
test_matrix <- as.matrix(test_set[, !colnames(train_set) %in% "Unit.Price....Gallon."])
dtrain <- xgb.DMatrix(data = train_matrix, label = train_set$Unit.Price....Gallon.)
dtest <- xgb.DMatrix(data = test_matrix, label = test_set$Unit.Price....Gallon.)
params <- list(
objective = "reg:squarederror",
eta = 0.1,
max_depth = 6,
colsample_bytree = 0.5,
subsample = 0.8
)
model_xgb <- xgb.train(
params = params,
data = dtrain,
nrounds = 100,
watchlist = list(train = dtrain, test = dtest),
early_stopping_rounds = 10,
verbose = 0
)
predictions_xgb <- predict(model_xgb, dtest)
# R² RMSE
r2_xgb <- cor(test_set$Unit.Price....Gallon., predictions_xgb)^2
rmse_xgb <- sqrt(mean((test_set$Unit.Price....Gallon. - predictions_xgb)^2))
cat("📌 XGBoost 结果:\n")
## 📌 XGBoost 结果:
cat("✅ R²:", r2_xgb, "\n")
## ✅ R²: 0.1928907
cat("✅ RMSE:", rmse_xgb, "\n")
## ✅ RMSE: 0.3895091
importance_matrix <- xgb.importance(feature_names = colnames(train_matrix), model = model_xgb)
xgb.plot.importance(importance_matrix)

xgb.plot.importance(importance_matrix, top_n = 15) # 只显示前 15 个重要变量

# predict the price
predictions_xgb <- predict(model_xgb, dtest)
test_set$Predicted_Price <- predictions_xgb
head(test_set[, c("Unit.Price....Gallon.", "Predicted_Price")])
## # A tibble: 6 × 2
## Unit.Price....Gallon. Predicted_Price
## <dbl> <dbl>
## 1 3.77 3.36
## 2 3.49 3.03
## 3 3.81 3.53
## 4 3.65 3.54
## 5 2.99 3.26
## 6 3.83 3.46
library(ggplot2)
ggplot(test_set, aes(x = Unit.Price....Gallon., y = Predicted_Price)) +
geom_point(color = "blue", alpha = 0.6) +
geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
labs(title = "real vs predicted") +
theme_minimal()
