library(readxl)
library(tidyverse)  
library(caret)      
library(ggplot2)    
library(dplyr)
library(car)
library(DMwR2)
library(ROSE)
library(corrplot)
library(reshape2)
library(vcd)
library(pROC)
library(knitr)
library(randomForest)
library(xgboost)
library(GGally)
library(ggpubr)
library(nortest)
library(stringr)
library(writexl)
library(moments)
library(Metrics)
file_path <- "C:/Users/yuan1/Downloads/transfer_pricing_daily_data_v2.xlsx"
price<- readxl::read_excel(file_path)
# make the names readable for R
colnames(price) <- make.names(colnames(price))
print(colnames(price))
##  [1] "Company.Name"                             
##  [2] "Business.Unit"                            
##  [3] "Report.Period"                            
##  [4] "Currency"                                 
##  [5] "Region"                                   
##  [6] "Transfer.Pricing.Method"                  
##  [7] "Benchmarking"                             
##  [8] "Transaction.Volume..Gallons."             
##  [9] "Unit.Price....Gallon."                    
## [10] "Market.Reference.Price....Gallon."        
## [11] "Trade.Terms"                              
## [12] "Crude.Oil.Purchase.Price....Barrel."      
## [13] "Refining.Cost....Gallon."                 
## [14] "Transport.Cost....Gallon."                
## [15] "Storage.Cost....Gallon."                  
## [16] "Pipeline.Tariffs....Gallon."              
## [17] "Terminal.Fees....Gallon."                 
## [18] "Operating.Expenses....Gallon."            
## [19] "Depreciation.Costs....Gallon."            
## [20] "Environmental.Compliance.Costs....Gallon."
## [21] "Transfer.Pricing.Formula"                 
## [22] "Market.Benchmark....Gallon."              
## [23] "Profit.Margin...."                        
## [24] "Risk.Adjustments....Gallon."              
## [25] "Net.Profit.Margin...."                    
## [26] "Return.on.Assets...."                     
## [27] "Competitor.Price....Gallon."              
## [28] "Regional.Supply.Demand.Trends"            
## [29] "Alternative.Energy.Impact"                
## [30] "Excise.Tax....Gallon."                    
## [31] "Value.Added.Tax...."                      
## [32] "Carbon.Emission.Tax....Gallon."           
## [33] "Market.Price.Volatility...."              
## [34] "Exchange.Rate.Risk...."                   
## [35] "Geopolitical.Risk"                        
## [36] "Refinery.Utilization.Rate...."            
## [37] "Processing.Capacity..Barrels.Day."        
## [38] "Yield.per.Barrel..Gallons."               
## [39] "Total.Transaction.Value...."              
## [40] "Refinery.Gross.Margin....Gallon."         
## [41] "Break.even.Price....Gallon."              
## [42] "Product.Name"
# count the varibale has only one level
unique_counts <- sapply(price, function(x) n_distinct(x, na.rm = TRUE))
one_level_vars <- names(unique_counts[unique_counts == 1])
print(one_level_vars)
## [1] "Company.Name"             "Business.Unit"           
## [3] "Currency"                 "Region"                  
## [5] "Transfer.Pricing.Method"  "Benchmarking"            
## [7] "Transfer.Pricing.Formula" "Product.Name"
# look for missing data
sum(is.na(price)) 
## [1] 0
#look for duplicate data
sum(duplicated(price))
## [1] 0
str(price)
## tibble [366 × 42] (S3: tbl_df/tbl/data.frame)
##  $ Company.Name                             : chr [1:366] "Global Energy Corp" "Global Energy Corp" "Global Energy Corp" "Global Energy Corp" ...
##  $ Business.Unit                            : chr [1:366] "Refining & Trading" "Refining & Trading" "Refining & Trading" "Refining & Trading" ...
##  $ Report.Period                            : chr [1:366] "2024-01-01" "2024-01-02" "2024-01-03" "2024-01-04" ...
##  $ Currency                                 : chr [1:366] "USD" "USD" "USD" "USD" ...
##  $ Region                                   : chr [1:366] "Usa" "Usa" "Usa" "Usa" ...
##  $ Transfer.Pricing.Method                  : chr [1:366] "Cost-Plus Pricing" "Cost-Plus Pricing" "Cost-Plus Pricing" "Cost-Plus Pricing" ...
##  $ Benchmarking                             : chr [1:366] "Market Price Comparison" "Market Price Comparison" "Market Price Comparison" "Market Price Comparison" ...
##  $ Transaction.Volume..Gallons.             : num [1:366] 438352 184378 141337 43593 225908 ...
##  $ Unit.Price....Gallon.                    : num [1:366] 3.77 3.97 3.19 3.88 3.83 3.49 2.68 3.49 3.65 3.64 ...
##  $ Market.Reference.Price....Gallon.        : num [1:366] 2.98 3.73 2.51 3.86 3.67 2.69 2.84 3.27 2.92 3.97 ...
##  $ Trade.Terms                              : chr [1:366] "CIF" "FOB" "CIF" "CIF" ...
##  $ Crude.Oil.Purchase.Price....Barrel.      : num [1:366] 78.8 67 88.1 70.9 64.7 ...
##  $ Refining.Cost....Gallon.                 : num [1:366] 0.39 0.26 0.25 0.2 0.43 0.45 0.38 0.41 0.41 0.35 ...
##  $ Transport.Cost....Gallon.                : num [1:366] 0.06 0.13 0.11 0.1 0.11 0.15 0.13 0.1 0.12 0.14 ...
##  $ Storage.Cost....Gallon.                  : num [1:366] 0.05 0.07 0.04 0.03 0.05 0.05 0.05 0.07 0.07 0.07 ...
##  $ Pipeline.Tariffs....Gallon.              : num [1:366] 0.05 0.01 0.01 0.04 0.03 0.04 0.02 0.04 0.01 0.03 ...
##  $ Terminal.Fees....Gallon.                 : num [1:366] 0.06 0.06 0.04 0.04 0.02 0.06 0.03 0.03 0.05 0.03 ...
##  $ Operating.Expenses....Gallon.            : num [1:366] 0.24 0.19 0.26 0.23 0.25 0.24 0.17 0.12 0.26 0.27 ...
##  $ Depreciation.Costs....Gallon.            : num [1:366] 0.05 0.06 0.15 0.12 0.11 0.18 0.16 0.1 0.18 0.06 ...
##  $ Environmental.Compliance.Costs....Gallon.: num [1:366] 0.03 0.03 0.02 0.04 0.05 0.02 0.05 0.03 0.02 0.01 ...
##  $ Transfer.Pricing.Formula                 : chr [1:366] "Market Benchmark + Cost-Plus" "Market Benchmark + Cost-Plus" "Market Benchmark + Cost-Plus" "Market Benchmark + Cost-Plus" ...
##  $ Market.Benchmark....Gallon.              : num [1:366] 3.27 3.85 3.11 2.79 2.87 3.98 2.85 3.14 3.71 2.7 ...
##  $ Profit.Margin....                        : num [1:366] 11.94 12.13 6.39 13.06 13.81 ...
##  $ Risk.Adjustments....Gallon.              : num [1:366] 0.09 0.08 0.03 0.09 0.03 0.04 0.02 0.06 0.02 0.02 ...
##  $ Net.Profit.Margin....                    : num [1:366] 4.93 7.72 5.93 8.01 6.67 8.22 3.75 8.8 7.51 8.52 ...
##  $ Return.on.Assets....                     : num [1:366] 5.08 5.53 3.4 4.2 2.82 4.37 7.16 2.14 7.76 4.35 ...
##  $ Competitor.Price....Gallon.              : num [1:366] 3.18 3.27 4 3.47 3.25 3.05 3.58 3.21 2.97 3.85 ...
##  $ Regional.Supply.Demand.Trends            : chr [1:366] "Stable" "Stable" "Increasing Demand" "Stable" ...
##  $ Alternative.Energy.Impact                : chr [1:366] "High" "Moderate" "Moderate" "Moderate" ...
##  $ Excise.Tax....Gallon.                    : num [1:366] 0.1 0.08 0.06 0.08 0.07 0.14 0.16 0.16 0.06 0.19 ...
##  $ Value.Added.Tax....                      : num [1:366] 17.6 11.42 9.4 10.62 7.86 ...
##  $ Carbon.Emission.Tax....Gallon.           : num [1:366] 0.01 0.03 0.09 0.05 0.08 0.05 0.01 0.03 0.1 0.04 ...
##  $ Market.Price.Volatility....              : num [1:366] 2.9 4.12 4.09 7.86 2.74 7.2 5.49 1.61 7.4 2.01 ...
##  $ Exchange.Rate.Risk....                   : num [1:366] 3.4 1.46 2.09 2.72 4.2 1.21 1.82 4.99 2.6 2.65 ...
##  $ Geopolitical.Risk                        : chr [1:366] "High" "High" "Low" "Medium" ...
##  $ Refinery.Utilization.Rate....            : num [1:366] 80 91.3 91.3 86.3 90.8 ...
##  $ Processing.Capacity..Barrels.Day.        : num [1:366] 138129 355755 397510 174968 317043 ...
##  $ Yield.per.Barrel..Gallons.               : num [1:366] 33 31 35 34 34 34 30 32 36 32 ...
##  $ Total.Transaction.Value....              : num [1:366] 1652587 731981 450865 169141 865228 ...
##  $ Refinery.Gross.Margin....Gallon.         : num [1:366] 2.59 3.47 2.26 3.66 3.24 2.24 2.46 2.86 2.51 3.62 ...
##  $ Break.even.Price....Gallon.              : num [1:366] 0.45 0.39 0.36 0.3 0.54 0.6 0.51 0.51 0.53 0.49 ...
##  $ Product.Name                             : chr [1:366] "Gasoline" "Gasoline" "Gasoline" "Gasoline" ...
#chagne chr to factor for modeleling 
price <- price %>% mutate_if(is.character, as.factor)
str(price)
## tibble [366 × 42] (S3: tbl_df/tbl/data.frame)
##  $ Company.Name                             : Factor w/ 1 level "Global Energy Corp": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Business.Unit                            : Factor w/ 1 level "Refining & Trading": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Report.Period                            : Factor w/ 366 levels "2024-01-01","2024-01-02",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Currency                                 : Factor w/ 1 level "USD": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Region                                   : Factor w/ 1 level "Usa": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Transfer.Pricing.Method                  : Factor w/ 1 level "Cost-Plus Pricing": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Benchmarking                             : Factor w/ 1 level "Market Price Comparison": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Transaction.Volume..Gallons.             : num [1:366] 438352 184378 141337 43593 225908 ...
##  $ Unit.Price....Gallon.                    : num [1:366] 3.77 3.97 3.19 3.88 3.83 3.49 2.68 3.49 3.65 3.64 ...
##  $ Market.Reference.Price....Gallon.        : num [1:366] 2.98 3.73 2.51 3.86 3.67 2.69 2.84 3.27 2.92 3.97 ...
##  $ Trade.Terms                              : Factor w/ 3 levels "CIF","DAP","FOB": 1 3 1 1 3 2 1 3 3 2 ...
##  $ Crude.Oil.Purchase.Price....Barrel.      : num [1:366] 78.8 67 88.1 70.9 64.7 ...
##  $ Refining.Cost....Gallon.                 : num [1:366] 0.39 0.26 0.25 0.2 0.43 0.45 0.38 0.41 0.41 0.35 ...
##  $ Transport.Cost....Gallon.                : num [1:366] 0.06 0.13 0.11 0.1 0.11 0.15 0.13 0.1 0.12 0.14 ...
##  $ Storage.Cost....Gallon.                  : num [1:366] 0.05 0.07 0.04 0.03 0.05 0.05 0.05 0.07 0.07 0.07 ...
##  $ Pipeline.Tariffs....Gallon.              : num [1:366] 0.05 0.01 0.01 0.04 0.03 0.04 0.02 0.04 0.01 0.03 ...
##  $ Terminal.Fees....Gallon.                 : num [1:366] 0.06 0.06 0.04 0.04 0.02 0.06 0.03 0.03 0.05 0.03 ...
##  $ Operating.Expenses....Gallon.            : num [1:366] 0.24 0.19 0.26 0.23 0.25 0.24 0.17 0.12 0.26 0.27 ...
##  $ Depreciation.Costs....Gallon.            : num [1:366] 0.05 0.06 0.15 0.12 0.11 0.18 0.16 0.1 0.18 0.06 ...
##  $ Environmental.Compliance.Costs....Gallon.: num [1:366] 0.03 0.03 0.02 0.04 0.05 0.02 0.05 0.03 0.02 0.01 ...
##  $ Transfer.Pricing.Formula                 : Factor w/ 1 level "Market Benchmark + Cost-Plus": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Market.Benchmark....Gallon.              : num [1:366] 3.27 3.85 3.11 2.79 2.87 3.98 2.85 3.14 3.71 2.7 ...
##  $ Profit.Margin....                        : num [1:366] 11.94 12.13 6.39 13.06 13.81 ...
##  $ Risk.Adjustments....Gallon.              : num [1:366] 0.09 0.08 0.03 0.09 0.03 0.04 0.02 0.06 0.02 0.02 ...
##  $ Net.Profit.Margin....                    : num [1:366] 4.93 7.72 5.93 8.01 6.67 8.22 3.75 8.8 7.51 8.52 ...
##  $ Return.on.Assets....                     : num [1:366] 5.08 5.53 3.4 4.2 2.82 4.37 7.16 2.14 7.76 4.35 ...
##  $ Competitor.Price....Gallon.              : num [1:366] 3.18 3.27 4 3.47 3.25 3.05 3.58 3.21 2.97 3.85 ...
##  $ Regional.Supply.Demand.Trends            : Factor w/ 3 levels "Decreasing Demand",..: 3 3 2 3 3 3 2 2 2 2 ...
##  $ Alternative.Energy.Impact                : Factor w/ 3 levels "High","Low","Moderate": 1 3 3 3 3 2 3 1 2 1 ...
##  $ Excise.Tax....Gallon.                    : num [1:366] 0.1 0.08 0.06 0.08 0.07 0.14 0.16 0.16 0.06 0.19 ...
##  $ Value.Added.Tax....                      : num [1:366] 17.6 11.42 9.4 10.62 7.86 ...
##  $ Carbon.Emission.Tax....Gallon.           : num [1:366] 0.01 0.03 0.09 0.05 0.08 0.05 0.01 0.03 0.1 0.04 ...
##  $ Market.Price.Volatility....              : num [1:366] 2.9 4.12 4.09 7.86 2.74 7.2 5.49 1.61 7.4 2.01 ...
##  $ Exchange.Rate.Risk....                   : num [1:366] 3.4 1.46 2.09 2.72 4.2 1.21 1.82 4.99 2.6 2.65 ...
##  $ Geopolitical.Risk                        : Factor w/ 3 levels "High","Low","Medium": 1 1 2 3 2 2 1 1 1 2 ...
##  $ Refinery.Utilization.Rate....            : num [1:366] 80 91.3 91.3 86.3 90.8 ...
##  $ Processing.Capacity..Barrels.Day.        : num [1:366] 138129 355755 397510 174968 317043 ...
##  $ Yield.per.Barrel..Gallons.               : num [1:366] 33 31 35 34 34 34 30 32 36 32 ...
##  $ Total.Transaction.Value....              : num [1:366] 1652587 731981 450865 169141 865228 ...
##  $ Refinery.Gross.Margin....Gallon.         : num [1:366] 2.59 3.47 2.26 3.66 3.24 2.24 2.46 2.86 2.51 3.62 ...
##  $ Break.even.Price....Gallon.              : num [1:366] 0.45 0.39 0.36 0.3 0.54 0.6 0.51 0.51 0.53 0.49 ...
##  $ Product.Name                             : Factor w/ 1 level "Gasoline": 1 1 1 1 1 1 1 1 1 1 ...
#data summary
summary(price)
##              Company.Name            Business.Unit    Report.Period Currency 
##  Global Energy Corp:366   Refining & Trading:366   2024-01-01:  1   USD:366  
##                                                    2024-01-02:  1            
##                                                    2024-01-03:  1            
##                                                    2024-01-04:  1            
##                                                    2024-01-05:  1            
##                                                    2024-01-06:  1            
##                                                    (Other)   :360            
##  Region         Transfer.Pricing.Method                  Benchmarking
##  Usa:366   Cost-Plus Pricing:366        Market Price Comparison:366  
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##                                                                      
##  Transaction.Volume..Gallons. Unit.Price....Gallon.
##  Min.   : 11043               Min.   :2.500        
##  1st Qu.:132563               1st Qu.:2.890        
##  Median :238005               Median :3.285        
##  Mean   :246641               Mean   :3.257        
##  3rd Qu.:366264               3rd Qu.:3.600        
##  Max.   :497593               Max.   :4.000        
##                                                    
##  Market.Reference.Price....Gallon. Trade.Terms
##  Min.   :2.510                     CIF:131    
##  1st Qu.:2.973                     DAP:103    
##  Median :3.395                     FOB:132    
##  Mean   :3.369                                
##  3rd Qu.:3.780                                
##  Max.   :4.200                                
##                                               
##  Crude.Oil.Purchase.Price....Barrel. Refining.Cost....Gallon.
##  Min.   :60.12                       Min.   :0.2000          
##  1st Qu.:66.25                       1st Qu.:0.2700          
##  Median :74.02                       Median :0.3400          
##  Mean   :74.39                       Mean   :0.3434          
##  3rd Qu.:82.21                       3rd Qu.:0.4200          
##  Max.   :89.88                       Max.   :0.5000          
##                                                              
##  Transport.Cost....Gallon. Storage.Cost....Gallon. Pipeline.Tariffs....Gallon.
##  Min.   :0.0500            Min.   :0.02000         Min.   :0.01000            
##  1st Qu.:0.0800            1st Qu.:0.04000         1st Qu.:0.02000            
##  Median :0.1000            Median :0.05000         Median :0.03000            
##  Mean   :0.1014            Mean   :0.04959         Mean   :0.03011            
##  3rd Qu.:0.1300            3rd Qu.:0.06000         3rd Qu.:0.04000            
##  Max.   :0.1500            Max.   :0.08000         Max.   :0.05000            
##                                                                               
##  Terminal.Fees....Gallon. Operating.Expenses....Gallon.
##  Min.   :0.02000          Min.   :0.1000               
##  1st Qu.:0.03000          1st Qu.:0.1500               
##  Median :0.04000          Median :0.2100               
##  Mean   :0.03975          Mean   :0.2015               
##  3rd Qu.:0.05000          3rd Qu.:0.2500               
##  Max.   :0.06000          Max.   :0.3000               
##                                                        
##  Depreciation.Costs....Gallon. Environmental.Compliance.Costs....Gallon.
##  Min.   :0.0500                Min.   :0.01000                          
##  1st Qu.:0.0800                1st Qu.:0.02000                          
##  Median :0.1200                Median :0.03000                          
##  Mean   :0.1240                Mean   :0.03005                          
##  3rd Qu.:0.1675                3rd Qu.:0.04000                          
##  Max.   :0.2000                Max.   :0.05000                          
##                                                                         
##                  Transfer.Pricing.Formula Market.Benchmark....Gallon.
##  Market Benchmark + Cost-Plus:366         Min.   :2.600              
##                                           1st Qu.:2.940              
##                                           Median :3.285              
##                                           Mean   :3.284              
##                                           3rd Qu.:3.660              
##                                           Max.   :4.000              
##                                                                      
##  Profit.Margin.... Risk.Adjustments....Gallon. Net.Profit.Margin....
##  Min.   : 5.040    Min.   :0.01000             Min.   :3.020        
##  1st Qu.: 7.293    1st Qu.:0.03000             1st Qu.:4.825        
##  Median : 9.575    Median :0.05000             Median :6.590        
##  Mean   : 9.783    Mean   :0.05413             Mean   :6.489        
##  3rd Qu.:11.985    3rd Qu.:0.08000             3rd Qu.:8.137        
##  Max.   :14.950    Max.   :0.10000             Max.   :9.960        
##                                                                     
##  Return.on.Assets.... Competitor.Price....Gallon.
##  Min.   :2.010        Min.   :2.510              
##  1st Qu.:3.442        1st Qu.:2.960              
##  Median :4.940        Median :3.340              
##  Mean   :4.949        Mean   :3.333              
##  3rd Qu.:6.410        3rd Qu.:3.720              
##  Max.   :7.990        Max.   :4.090              
##                                                  
##    Regional.Supply.Demand.Trends Alternative.Energy.Impact
##  Decreasing Demand:118           High    :117             
##  Increasing Demand:124           Low     :123             
##  Stable           :124           Moderate:126             
##                                                           
##                                                           
##                                                           
##                                                           
##  Excise.Tax....Gallon. Value.Added.Tax.... Carbon.Emission.Tax....Gallon.
##  Min.   :0.050         Min.   : 5.070      Min.   :0.01000               
##  1st Qu.:0.090         1st Qu.: 9.367      1st Qu.:0.03000               
##  Median :0.120         Median :13.160      Median :0.05000               
##  Mean   :0.123         Mean   :12.759      Mean   :0.05434               
##  3rd Qu.:0.160         3rd Qu.:16.137      3rd Qu.:0.08000               
##  Max.   :0.200         Max.   :19.900      Max.   :0.10000               
##                                                                          
##  Market.Price.Volatility.... Exchange.Rate.Risk.... Geopolitical.Risk
##  Min.   :1.040               Min.   :0.510          High  :123       
##  1st Qu.:3.103               1st Qu.:1.512          Low   :113       
##  Median :5.110               Median :2.660          Medium:130       
##  Mean   :5.416               Mean   :2.648                           
##  3rd Qu.:7.577               3rd Qu.:3.690                           
##  Max.   :9.980               Max.   :5.000                           
##                                                                      
##  Refinery.Utilization.Rate.... Processing.Capacity..Barrels.Day.
##  Min.   :75.03                 Min.   :100399                   
##  1st Qu.:80.32                 1st Qu.:197118                   
##  Median :85.64                 Median :297231                   
##  Mean   :85.37                 Mean   :295243                   
##  3rd Qu.:90.46                 3rd Qu.:392937                   
##  Max.   :94.98                 Max.   :499672                   
##                                                                 
##  Yield.per.Barrel..Gallons. Total.Transaction.Value....
##  Min.   :30.0               Min.   :  31141            
##  1st Qu.:33.0               1st Qu.: 421065            
##  Median :36.0               Median : 764578            
##  Mean   :35.5               Mean   : 805110            
##  3rd Qu.:39.0               3rd Qu.:1202052            
##  Max.   :41.0               Max.   :1946554            
##                                                        
##  Refinery.Gross.Margin....Gallon. Break.even.Price....Gallon.   Product.Name
##  Min.   :2.080                    Min.   :0.2500              Gasoline:366  
##  1st Qu.:2.603                    1st Qu.:0.3700                            
##  Median :3.065                    Median :0.4400                            
##  Mean   :3.025                    Mean   :0.4449                            
##  3rd Qu.:3.428                    3rd Qu.:0.5200                            
##  Max.   :3.970                    Max.   :0.6500                            
## 
# look for distinct of each variable
numeric_vars <- price[, sapply(price, is.numeric), drop = FALSE]
num_unique_values <- sapply(numeric_vars, n_distinct)
print(num_unique_values)
##              Transaction.Volume..Gallons. 
##                                       365 
##                     Unit.Price....Gallon. 
##                                       132 
##         Market.Reference.Price....Gallon. 
##                                       153 
##       Crude.Oil.Purchase.Price....Barrel. 
##                                       348 
##                  Refining.Cost....Gallon. 
##                                        31 
##                 Transport.Cost....Gallon. 
##                                        11 
##                   Storage.Cost....Gallon. 
##                                         7 
##               Pipeline.Tariffs....Gallon. 
##                                         5 
##                  Terminal.Fees....Gallon. 
##                                         5 
##             Operating.Expenses....Gallon. 
##                                        21 
##             Depreciation.Costs....Gallon. 
##                                        16 
## Environmental.Compliance.Costs....Gallon. 
##                                         5 
##               Market.Benchmark....Gallon. 
##                                       133 
##                         Profit.Margin.... 
##                                       303 
##               Risk.Adjustments....Gallon. 
##                                        10 
##                     Net.Profit.Margin.... 
##                                       285 
##                      Return.on.Assets.... 
##                                       265 
##               Competitor.Price....Gallon. 
##                                       143 
##                     Excise.Tax....Gallon. 
##                                        16 
##                       Value.Added.Tax.... 
##                                       325 
##            Carbon.Emission.Tax....Gallon. 
##                                        10 
##               Market.Price.Volatility.... 
##                                       297 
##                    Exchange.Rate.Risk.... 
##                                       253 
##             Refinery.Utilization.Rate.... 
##                                       333 
##         Processing.Capacity..Barrels.Day. 
##                                       366 
##                Yield.per.Barrel..Gallons. 
##                                        12 
##               Total.Transaction.Value.... 
##                                       366 
##          Refinery.Gross.Margin....Gallon. 
##                                       154 
##               Break.even.Price....Gallon. 
##                                        48
# Convert columns with <10 unique values to factors
price <- as.data.frame(lapply(price, function(x) if (length(unique(x)) < 10) as.factor(x) else x))

# Check structure
str(price)
## 'data.frame':    366 obs. of  42 variables:
##  $ Company.Name                             : Factor w/ 1 level "Global Energy Corp": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Business.Unit                            : Factor w/ 1 level "Refining & Trading": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Report.Period                            : Factor w/ 366 levels "2024-01-01","2024-01-02",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ Currency                                 : Factor w/ 1 level "USD": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Region                                   : Factor w/ 1 level "Usa": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Transfer.Pricing.Method                  : Factor w/ 1 level "Cost-Plus Pricing": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Benchmarking                             : Factor w/ 1 level "Market Price Comparison": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Transaction.Volume..Gallons.             : num  438352 184378 141337 43593 225908 ...
##  $ Unit.Price....Gallon.                    : num  3.77 3.97 3.19 3.88 3.83 3.49 2.68 3.49 3.65 3.64 ...
##  $ Market.Reference.Price....Gallon.        : num  2.98 3.73 2.51 3.86 3.67 2.69 2.84 3.27 2.92 3.97 ...
##  $ Trade.Terms                              : Factor w/ 3 levels "CIF","DAP","FOB": 1 3 1 1 3 2 1 3 3 2 ...
##  $ Crude.Oil.Purchase.Price....Barrel.      : num  78.8 67 88.1 70.9 64.7 ...
##  $ Refining.Cost....Gallon.                 : num  0.39 0.26 0.25 0.2 0.43 0.45 0.38 0.41 0.41 0.35 ...
##  $ Transport.Cost....Gallon.                : num  0.06 0.13 0.11 0.1 0.11 0.15 0.13 0.1 0.12 0.14 ...
##  $ Storage.Cost....Gallon.                  : Factor w/ 7 levels "0.02","0.03",..: 4 6 3 2 4 4 4 6 6 6 ...
##  $ Pipeline.Tariffs....Gallon.              : Factor w/ 5 levels "0.01","0.02",..: 5 1 1 4 3 4 2 4 1 3 ...
##  $ Terminal.Fees....Gallon.                 : Factor w/ 5 levels "0.02","0.03",..: 5 5 3 3 1 5 2 2 4 2 ...
##  $ Operating.Expenses....Gallon.            : num  0.24 0.19 0.26 0.23 0.25 0.24 0.17 0.12 0.26 0.27 ...
##  $ Depreciation.Costs....Gallon.            : num  0.05 0.06 0.15 0.12 0.11 0.18 0.16 0.1 0.18 0.06 ...
##  $ Environmental.Compliance.Costs....Gallon.: Factor w/ 5 levels "0.01","0.02",..: 3 3 2 4 5 2 5 3 2 1 ...
##  $ Transfer.Pricing.Formula                 : Factor w/ 1 level "Market Benchmark + Cost-Plus": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Market.Benchmark....Gallon.              : num  3.27 3.85 3.11 2.79 2.87 3.98 2.85 3.14 3.71 2.7 ...
##  $ Profit.Margin....                        : num  11.94 12.13 6.39 13.06 13.81 ...
##  $ Risk.Adjustments....Gallon.              : num  0.09 0.08 0.03 0.09 0.03 0.04 0.02 0.06 0.02 0.02 ...
##  $ Net.Profit.Margin....                    : num  4.93 7.72 5.93 8.01 6.67 8.22 3.75 8.8 7.51 8.52 ...
##  $ Return.on.Assets....                     : num  5.08 5.53 3.4 4.2 2.82 4.37 7.16 2.14 7.76 4.35 ...
##  $ Competitor.Price....Gallon.              : num  3.18 3.27 4 3.47 3.25 3.05 3.58 3.21 2.97 3.85 ...
##  $ Regional.Supply.Demand.Trends            : Factor w/ 3 levels "Decreasing Demand",..: 3 3 2 3 3 3 2 2 2 2 ...
##  $ Alternative.Energy.Impact                : Factor w/ 3 levels "High","Low","Moderate": 1 3 3 3 3 2 3 1 2 1 ...
##  $ Excise.Tax....Gallon.                    : num  0.1 0.08 0.06 0.08 0.07 0.14 0.16 0.16 0.06 0.19 ...
##  $ Value.Added.Tax....                      : num  17.6 11.42 9.4 10.62 7.86 ...
##  $ Carbon.Emission.Tax....Gallon.           : num  0.01 0.03 0.09 0.05 0.08 0.05 0.01 0.03 0.1 0.04 ...
##  $ Market.Price.Volatility....              : num  2.9 4.12 4.09 7.86 2.74 7.2 5.49 1.61 7.4 2.01 ...
##  $ Exchange.Rate.Risk....                   : num  3.4 1.46 2.09 2.72 4.2 1.21 1.82 4.99 2.6 2.65 ...
##  $ Geopolitical.Risk                        : Factor w/ 3 levels "High","Low","Medium": 1 1 2 3 2 2 1 1 1 2 ...
##  $ Refinery.Utilization.Rate....            : num  80 91.3 91.3 86.3 90.8 ...
##  $ Processing.Capacity..Barrels.Day.        : num  138129 355755 397510 174968 317043 ...
##  $ Yield.per.Barrel..Gallons.               : num  33 31 35 34 34 34 30 32 36 32 ...
##  $ Total.Transaction.Value....              : num  1652587 731981 450865 169141 865228 ...
##  $ Refinery.Gross.Margin....Gallon.         : num  2.59 3.47 2.26 3.66 3.24 2.24 2.46 2.86 2.51 3.62 ...
##  $ Break.even.Price....Gallon.              : num  0.45 0.39 0.36 0.3 0.54 0.6 0.51 0.51 0.53 0.49 ...
##  $ Product.Name                             : Factor w/ 1 level "Gasoline": 1 1 1 1 1 1 1 1 1 1 ...
numeric <- price[, sapply(price, is.numeric), drop = FALSE]
factor <- price[, sapply(price, is.character), drop = FALSE]
numeric <- numeric %>%
  rename_with(~ str_wrap(.x, width = 20))  

numeric %>%
  gather(key = "Variable", value = "Value") %>%
  ggplot(aes(x = Value)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(color = "blue", size = 0.5) +
  facet_wrap(~Variable, scales = "free") +
  theme_minimal() +
   theme(strip.text = element_text(size = 6)) +
  ggtitle("Histogram and Density Plot of Numeric Variables")

# format the plot
numeric <- numeric %>%
  rename_with(~ stringr::str_wrap(.x, width = 20))

numeric <- numeric %>%
  gather(key = "Variable", value = "Value")

#  Q-Q Plot
ggplot(numeric, aes(sample = Value)) +
  stat_qq() +
  stat_qq_line(color = "red") +  
  facet_wrap(~Variable, scales = "free") + 
  theme_minimal() +
  theme(strip.text = element_text(size = 6)) +
  ggtitle("Q-Q Plot of Numeric Variables")

# look for num outliers
numeric_vars <- price %>% dplyr::select(where(is.numeric))

outliers <- lapply(numeric_vars, function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  which(x < lower_bound | x > upper_bound)  
})

outliers
## $Transaction.Volume..Gallons.
## integer(0)
## 
## $Unit.Price....Gallon.
## integer(0)
## 
## $Market.Reference.Price....Gallon.
## integer(0)
## 
## $Crude.Oil.Purchase.Price....Barrel.
## integer(0)
## 
## $Refining.Cost....Gallon.
## integer(0)
## 
## $Transport.Cost....Gallon.
## integer(0)
## 
## $Operating.Expenses....Gallon.
## integer(0)
## 
## $Depreciation.Costs....Gallon.
## integer(0)
## 
## $Market.Benchmark....Gallon.
## integer(0)
## 
## $Profit.Margin....
## integer(0)
## 
## $Risk.Adjustments....Gallon.
## integer(0)
## 
## $Net.Profit.Margin....
## integer(0)
## 
## $Return.on.Assets....
## integer(0)
## 
## $Competitor.Price....Gallon.
## integer(0)
## 
## $Excise.Tax....Gallon.
## integer(0)
## 
## $Value.Added.Tax....
## integer(0)
## 
## $Carbon.Emission.Tax....Gallon.
## integer(0)
## 
## $Market.Price.Volatility....
## integer(0)
## 
## $Exchange.Rate.Risk....
## integer(0)
## 
## $Refinery.Utilization.Rate....
## integer(0)
## 
## $Processing.Capacity..Barrels.Day.
## integer(0)
## 
## $Yield.per.Barrel..Gallons.
## integer(0)
## 
## $Total.Transaction.Value....
## integer(0)
## 
## $Refinery.Gross.Margin....Gallon.
## integer(0)
## 
## $Break.even.Price....Gallon.
## integer(0)
numeric <- price[, sapply(price, is.numeric), drop = FALSE]

# look for the for the correlation between num and y
cor_results <- cor(numeric, use = "complete.obs")  
cor_results["Unit.Price....Gallon.", ]  
##        Transaction.Volume..Gallons.               Unit.Price....Gallon. 
##                         0.031694613                         1.000000000 
##   Market.Reference.Price....Gallon. Crude.Oil.Purchase.Price....Barrel. 
##                         0.127159227                         0.019416446 
##            Refining.Cost....Gallon.           Transport.Cost....Gallon. 
##                        -0.059677350                         0.032534250 
##       Operating.Expenses....Gallon.       Depreciation.Costs....Gallon. 
##                         0.047817560                        -0.050515793 
##         Market.Benchmark....Gallon.                   Profit.Margin.... 
##                        -0.046882739                        -0.076025599 
##         Risk.Adjustments....Gallon.               Net.Profit.Margin.... 
##                        -0.054379813                        -0.078125814 
##                Return.on.Assets....         Competitor.Price....Gallon. 
##                         0.008796997                         0.083188138 
##               Excise.Tax....Gallon.                 Value.Added.Tax.... 
##                        -0.037687003                         0.045761039 
##      Carbon.Emission.Tax....Gallon.         Market.Price.Volatility.... 
##                        -0.004259401                        -0.020275851 
##              Exchange.Rate.Risk....       Refinery.Utilization.Rate.... 
##                         0.103598985                         0.048157393 
##   Processing.Capacity..Barrels.Day.          Yield.per.Barrel..Gallons. 
##                        -0.018505055                         0.107334769 
##         Total.Transaction.Value....    Refinery.Gross.Margin....Gallon. 
##                         0.253527157                         0.136125104 
##         Break.even.Price....Gallon. 
##                        -0.045477075
#cor > 0.5 →positive relation, price go up when ind bigger
#cor < -0.5 → neg relation, price go down when ind bigger
#cor ≈ 0 → no correlation
#remove one level factor
factor <- price[, sapply(price, is.character), drop = FALSE] 
unique_counts <- sapply(factor, n_distinct)   

one_level_factors <- names(unique_counts[unique_counts == 1])
print(one_level_factors)
## character(0)
dim(price)  
## [1] 366  42
#correlation look for correlation between num
numeric <- price[, sapply(price, is.numeric), drop = FALSE]

cor_matrix <- cor(numeric_vars, use = "complete.obs")

#  (|cor| > 0.7)
high_cor_pairs <- which(abs(cor_matrix) > 0.7 & abs(cor_matrix) < 1, arr.ind = TRUE)

high_cor_df <- data.frame(
  Variable_1 = rownames(cor_matrix)[high_cor_pairs[, 1]],
  Variable_2 = colnames(cor_matrix)[high_cor_pairs[, 2]],
  Correlation = cor_matrix[high_cor_pairs]
)

high_cor_df <- high_cor_df %>%
  distinct(Variable_1, Variable_2, .keep_all = TRUE)

print(high_cor_df)
##                          Variable_1                        Variable_2
## 1       Total.Transaction.Value....      Transaction.Volume..Gallons.
## 2  Refinery.Gross.Margin....Gallon. Market.Reference.Price....Gallon.
## 3       Break.even.Price....Gallon.          Refining.Cost....Gallon.
## 4      Transaction.Volume..Gallons.       Total.Transaction.Value....
## 5 Market.Reference.Price....Gallon.  Refinery.Gross.Margin....Gallon.
## 6          Refining.Cost....Gallon.       Break.even.Price....Gallon.
##   Correlation
## 1   0.9667413
## 2   0.9841007
## 3   0.9452115
## 4   0.9667413
## 5   0.9841007
## 6   0.9452115
#delete the high correlated
price <- price[, setdiff(names(price), c("Total.Transaction.Value....", 
                                         "Refinery.Gross.Margin....Gallon.", 
                                         "Break.even.Price....Gallon.")), drop = FALSE]
names(price)
##  [1] "Company.Name"                             
##  [2] "Business.Unit"                            
##  [3] "Report.Period"                            
##  [4] "Currency"                                 
##  [5] "Region"                                   
##  [6] "Transfer.Pricing.Method"                  
##  [7] "Benchmarking"                             
##  [8] "Transaction.Volume..Gallons."             
##  [9] "Unit.Price....Gallon."                    
## [10] "Market.Reference.Price....Gallon."        
## [11] "Trade.Terms"                              
## [12] "Crude.Oil.Purchase.Price....Barrel."      
## [13] "Refining.Cost....Gallon."                 
## [14] "Transport.Cost....Gallon."                
## [15] "Storage.Cost....Gallon."                  
## [16] "Pipeline.Tariffs....Gallon."              
## [17] "Terminal.Fees....Gallon."                 
## [18] "Operating.Expenses....Gallon."            
## [19] "Depreciation.Costs....Gallon."            
## [20] "Environmental.Compliance.Costs....Gallon."
## [21] "Transfer.Pricing.Formula"                 
## [22] "Market.Benchmark....Gallon."              
## [23] "Profit.Margin...."                        
## [24] "Risk.Adjustments....Gallon."              
## [25] "Net.Profit.Margin...."                    
## [26] "Return.on.Assets...."                     
## [27] "Competitor.Price....Gallon."              
## [28] "Regional.Supply.Demand.Trends"            
## [29] "Alternative.Energy.Impact"                
## [30] "Excise.Tax....Gallon."                    
## [31] "Value.Added.Tax...."                      
## [32] "Carbon.Emission.Tax....Gallon."           
## [33] "Market.Price.Volatility...."              
## [34] "Exchange.Rate.Risk...."                   
## [35] "Geopolitical.Risk"                        
## [36] "Refinery.Utilization.Rate...."            
## [37] "Processing.Capacity..Barrels.Day."        
## [38] "Yield.per.Barrel..Gallons."               
## [39] "Product.Name"
#build linear regression model
price <- as.data.frame(price) %>%
  mutate(across(where(is.character), as.factor))

set.seed(5678)
train_index <- sample(1:nrow(price), 0.8 * nrow(price))
train_data <- price[train_index, ]
test_data <- price[-train_index, ]

factor_cols <- names(train_data)[sapply(train_data, is.factor)]
single_level_factors <- factor_cols[sapply(train_data[factor_cols], function(x) length(unique(x)) == 1)]

if (length(single_level_factors) > 0) {
  train_data <- train_data[, !(names(train_data) %in% single_level_factors)]
}

# remove  Report.Period

train_data <- train_data %>% dplyr::select(-Report.Period)
test_data <- test_data %>% dplyr::select(-Report.Period)

lm_model <- lm(`Unit.Price....Gallon.` ~ ., data = train_data)

train_preds <- predict(lm_model, newdata = train_data)
test_preds <- predict(lm_model, newdata = test_data)

#  MSE
mse_train <- mean((train_data$`Unit.Price....Gallon.` - train_preds)^2)
mse_test <- mean((test_data$`Unit.Price....Gallon.` - test_preds)^2)

#  RMSE
rmse_train <- sqrt(mse_train)
rmse_test <- sqrt(mse_test)

#  R²
r_squared_train <- summary(lm_model)$r.squared
r_squared_test <- 1 - sum((test_preds - test_data$`Unit.Price....Gallon.`)^2) / 
                      sum((test_data$`Unit.Price....Gallon.` - mean(test_data$`Unit.Price....Gallon.`))^2)


cat("📌 **模型评估结果**\n")
## 📌 **模型评估结果**
cat("✅  MSE:", mse_train, "\n")
## ✅  MSE: 0.1510838
cat("✅  MSE:", mse_test, "\n")
## ✅  MSE: 0.1711638
cat("✅  RMSE:", rmse_train, "\n")
## ✅  RMSE: 0.388695
cat("✅  RMSE:", rmse_test, "\n")
## ✅  RMSE: 0.4137195
cat("✅  R²:", r_squared_train, "\n")
## ✅  R²: 0.1716628
cat("✅  R²:", r_squared_test, "\n")
## ✅  R²: -0.008538477
# second model random forest

file_path <- "C:/Users/yuan1/Downloads/transfer_pricing_daily_data_v2.xlsx"
price<- readxl::read_excel(file_path)

colnames(price) <- make.names(colnames(price))
print(colnames(price))
##  [1] "Company.Name"                             
##  [2] "Business.Unit"                            
##  [3] "Report.Period"                            
##  [4] "Currency"                                 
##  [5] "Region"                                   
##  [6] "Transfer.Pricing.Method"                  
##  [7] "Benchmarking"                             
##  [8] "Transaction.Volume..Gallons."             
##  [9] "Unit.Price....Gallon."                    
## [10] "Market.Reference.Price....Gallon."        
## [11] "Trade.Terms"                              
## [12] "Crude.Oil.Purchase.Price....Barrel."      
## [13] "Refining.Cost....Gallon."                 
## [14] "Transport.Cost....Gallon."                
## [15] "Storage.Cost....Gallon."                  
## [16] "Pipeline.Tariffs....Gallon."              
## [17] "Terminal.Fees....Gallon."                 
## [18] "Operating.Expenses....Gallon."            
## [19] "Depreciation.Costs....Gallon."            
## [20] "Environmental.Compliance.Costs....Gallon."
## [21] "Transfer.Pricing.Formula"                 
## [22] "Market.Benchmark....Gallon."              
## [23] "Profit.Margin...."                        
## [24] "Risk.Adjustments....Gallon."              
## [25] "Net.Profit.Margin...."                    
## [26] "Return.on.Assets...."                     
## [27] "Competitor.Price....Gallon."              
## [28] "Regional.Supply.Demand.Trends"            
## [29] "Alternative.Energy.Impact"                
## [30] "Excise.Tax....Gallon."                    
## [31] "Value.Added.Tax...."                      
## [32] "Carbon.Emission.Tax....Gallon."           
## [33] "Market.Price.Volatility...."              
## [34] "Exchange.Rate.Risk...."                   
## [35] "Geopolitical.Risk"                        
## [36] "Refinery.Utilization.Rate...."            
## [37] "Processing.Capacity..Barrels.Day."        
## [38] "Yield.per.Barrel..Gallons."               
## [39] "Total.Transaction.Value...."              
## [40] "Refinery.Gross.Margin....Gallon."         
## [41] "Break.even.Price....Gallon."              
## [42] "Product.Name"
numeric <- price[, sapply(price, is.numeric), drop = FALSE]
factor <- price[, sapply(price, is.character), drop = FALSE]


set.seed(456)
trainIndex <- sample(1:nrow(price), 0.8 * nrow(price))
train_data <- price[trainIndex, ]
test_data <- price[-trainIndex, ]

train_data <- train_data[, !(sapply(train_data, function(x) is.factor(x) && length(unique(x)) > 53))]

model_rf <- randomForest(Unit.Price....Gallon. ~ ., data = train_data, ntree = 500, importance = TRUE)

predictions_rf <- predict(model_rf, test_data)

# R²  RMSE
r2_rf <- cor(test_data$Unit.Price....Gallon., predictions_rf)^2
rmse_rf <- sqrt(mean((test_data$Unit.Price....Gallon. - predictions_rf)^2))

# 
cat("📌 Random Forest 结果:\n")
## 📌 Random Forest 结果:
cat("✅ R²:", r2_rf, "\n")
## ✅ R²: 0.1568565
cat("✅ RMSE:", rmse_rf, "\n")
## ✅ RMSE: 0.409466
importance(model_rf)
##                                               %IncMSE IncNodePurity
## Company.Name                               0.00000000     0.0000000
## Business.Unit                              0.00000000     0.0000000
## Report.Period                             -1.92583655     1.6338530
## Currency                                   0.00000000     0.0000000
## Region                                     0.00000000     0.0000000
## Transfer.Pricing.Method                    0.00000000     0.0000000
## Benchmarking                               0.00000000     0.0000000
## Transaction.Volume..Gallons.              13.59693177     3.7134133
## Market.Reference.Price....Gallon.          3.61228469     1.8830174
## Trade.Terms                                0.27723692     0.3111749
## Crude.Oil.Purchase.Price....Barrel.        0.79436861     2.1062521
## Refining.Cost....Gallon.                   0.61891180     1.1979896
## Transport.Cost....Gallon.                  0.35355994     1.1265967
## Storage.Cost....Gallon.                   -1.82136601     0.6494871
## Pipeline.Tariffs....Gallon.                0.06568102     0.5803790
## Terminal.Fees....Gallon.                   1.84934412     0.7097186
## Operating.Expenses....Gallon.              1.97593228     1.1786858
## Depreciation.Costs....Gallon.              1.75166526     1.2959349
## Environmental.Compliance.Costs....Gallon. -1.29428093     0.4902236
## Transfer.Pricing.Formula                   0.00000000     0.0000000
## Market.Benchmark....Gallon.               -0.96554040     1.5845444
## Profit.Margin....                         -0.30867093     1.7417089
## Risk.Adjustments....Gallon.                0.23128235     0.8994355
## Net.Profit.Margin....                      1.81687847     2.2295281
## Return.on.Assets....                       0.97520885     1.4943019
## Competitor.Price....Gallon.                1.40757782     1.5575766
## Regional.Supply.Demand.Trends              1.96960141     0.4883561
## Alternative.Energy.Impact                 -1.33935989     0.3630602
## Excise.Tax....Gallon.                      0.51935085     1.3067210
## Value.Added.Tax....                       -1.39098930     1.8127782
## Carbon.Emission.Tax....Gallon.            -0.77386478     0.9607585
## Market.Price.Volatility....               -2.74123895     1.4416808
## Exchange.Rate.Risk....                     1.25315920     1.6983066
## Geopolitical.Risk                         -0.64811050     0.3381729
## Refinery.Utilization.Rate....              0.60920090     1.7349602
## Processing.Capacity..Barrels.Day.          0.58056990     1.9898112
## Yield.per.Barrel..Gallons.                 0.32233402     1.2043611
## Total.Transaction.Value....               23.87195099     6.8260299
## Refinery.Gross.Margin....Gallon.           5.27824505     2.0006795
## Break.even.Price....Gallon.                1.78227975     1.3484866
## Product.Name                               0.00000000     0.0000000
varImpPlot(model_rf)

importance(model_rf)  
##                                               %IncMSE IncNodePurity
## Company.Name                               0.00000000     0.0000000
## Business.Unit                              0.00000000     0.0000000
## Report.Period                             -1.92583655     1.6338530
## Currency                                   0.00000000     0.0000000
## Region                                     0.00000000     0.0000000
## Transfer.Pricing.Method                    0.00000000     0.0000000
## Benchmarking                               0.00000000     0.0000000
## Transaction.Volume..Gallons.              13.59693177     3.7134133
## Market.Reference.Price....Gallon.          3.61228469     1.8830174
## Trade.Terms                                0.27723692     0.3111749
## Crude.Oil.Purchase.Price....Barrel.        0.79436861     2.1062521
## Refining.Cost....Gallon.                   0.61891180     1.1979896
## Transport.Cost....Gallon.                  0.35355994     1.1265967
## Storage.Cost....Gallon.                   -1.82136601     0.6494871
## Pipeline.Tariffs....Gallon.                0.06568102     0.5803790
## Terminal.Fees....Gallon.                   1.84934412     0.7097186
## Operating.Expenses....Gallon.              1.97593228     1.1786858
## Depreciation.Costs....Gallon.              1.75166526     1.2959349
## Environmental.Compliance.Costs....Gallon. -1.29428093     0.4902236
## Transfer.Pricing.Formula                   0.00000000     0.0000000
## Market.Benchmark....Gallon.               -0.96554040     1.5845444
## Profit.Margin....                         -0.30867093     1.7417089
## Risk.Adjustments....Gallon.                0.23128235     0.8994355
## Net.Profit.Margin....                      1.81687847     2.2295281
## Return.on.Assets....                       0.97520885     1.4943019
## Competitor.Price....Gallon.                1.40757782     1.5575766
## Regional.Supply.Demand.Trends              1.96960141     0.4883561
## Alternative.Energy.Impact                 -1.33935989     0.3630602
## Excise.Tax....Gallon.                      0.51935085     1.3067210
## Value.Added.Tax....                       -1.39098930     1.8127782
## Carbon.Emission.Tax....Gallon.            -0.77386478     0.9607585
## Market.Price.Volatility....               -2.74123895     1.4416808
## Exchange.Rate.Risk....                     1.25315920     1.6983066
## Geopolitical.Risk                         -0.64811050     0.3381729
## Refinery.Utilization.Rate....              0.60920090     1.7349602
## Processing.Capacity..Barrels.Day.          0.58056990     1.9898112
## Yield.per.Barrel..Gallons.                 0.32233402     1.2043611
## Total.Transaction.Value....               23.87195099     6.8260299
## Refinery.Gross.Margin....Gallon.           5.27824505     2.0006795
## Break.even.Price....Gallon.                1.78227975     1.3484866
## Product.Name                               0.00000000     0.0000000
#Model 3 XGBoost

file_path <- "C:/Users/yuan1/Downloads/transfer_pricing_daily_data_v2.xlsx"
price<- readxl::read_excel(file_path)

colnames(price) <- make.names(colnames(price))
print(colnames(price))
##  [1] "Company.Name"                             
##  [2] "Business.Unit"                            
##  [3] "Report.Period"                            
##  [4] "Currency"                                 
##  [5] "Region"                                   
##  [6] "Transfer.Pricing.Method"                  
##  [7] "Benchmarking"                             
##  [8] "Transaction.Volume..Gallons."             
##  [9] "Unit.Price....Gallon."                    
## [10] "Market.Reference.Price....Gallon."        
## [11] "Trade.Terms"                              
## [12] "Crude.Oil.Purchase.Price....Barrel."      
## [13] "Refining.Cost....Gallon."                 
## [14] "Transport.Cost....Gallon."                
## [15] "Storage.Cost....Gallon."                  
## [16] "Pipeline.Tariffs....Gallon."              
## [17] "Terminal.Fees....Gallon."                 
## [18] "Operating.Expenses....Gallon."            
## [19] "Depreciation.Costs....Gallon."            
## [20] "Environmental.Compliance.Costs....Gallon."
## [21] "Transfer.Pricing.Formula"                 
## [22] "Market.Benchmark....Gallon."              
## [23] "Profit.Margin...."                        
## [24] "Risk.Adjustments....Gallon."              
## [25] "Net.Profit.Margin...."                    
## [26] "Return.on.Assets...."                     
## [27] "Competitor.Price....Gallon."              
## [28] "Regional.Supply.Demand.Trends"            
## [29] "Alternative.Energy.Impact"                
## [30] "Excise.Tax....Gallon."                    
## [31] "Value.Added.Tax...."                      
## [32] "Carbon.Emission.Tax....Gallon."           
## [33] "Market.Price.Volatility...."              
## [34] "Exchange.Rate.Risk...."                   
## [35] "Geopolitical.Risk"                        
## [36] "Refinery.Utilization.Rate...."            
## [37] "Processing.Capacity..Barrels.Day."        
## [38] "Yield.per.Barrel..Gallons."               
## [39] "Total.Transaction.Value...."              
## [40] "Refinery.Gross.Margin....Gallon."         
## [41] "Break.even.Price....Gallon."              
## [42] "Product.Name"
numeric <- price[, sapply(price, is.numeric), drop = FALSE]
factor <- price[, sapply(price, is.character), drop = FALSE]
set.seed(456) 

train_data <- price  
train_data[] <- lapply(train_data, function(x) {
  if (is.factor(x) | is.character(x)) {
    return(as.numeric(as.factor(x)))
  } else {
    return(x)
  }
})
#   train_data[, -1] <- scale(train_data[, -1])

# emove the varibale more than 53
train_data <- train_data[, !(sapply(train_data, function(x) is.factor(x) && length(unique(x)) > 53))]

factor_cols <- sapply(train_data, is.factor)
train_data[factor_cols] <- lapply(train_data[factor_cols], function(x) as.numeric(as.character(x)))

train_data[is.na(train_data)] <- median(train_data$Unit.Price....Gallon., na.rm = TRUE)

# (Year, Month, Day)
train_data$Year <- as.numeric(substr(train_data$Report.Period, 1, 4))
train_data$Month <- as.numeric(substr(train_data$Report.Period, 6, 7))
train_data$Day <- as.numeric(substr(train_data$Report.Period, 9, 10))
train_data$Report.Period <- NULL  # 删除原始日期列

#   train_data[, -1] <- scale(train_data[, -1])

trainIndex <- sample(1:nrow(train_data), 0.8 * nrow(train_data))
train_set <- train_data[trainIndex, ]
test_set <- train_data[-trainIndex, ]

train_set[] <- lapply(train_set, function(x) if (is.character(x)) as.numeric(as.factor(x)) else x)
test_set[] <- lapply(test_set, function(x) if (is.character(x)) as.numeric(as.factor(x)) else x)

train_matrix <- as.matrix(train_set[, !colnames(train_set) %in% "Unit.Price....Gallon."]) 
test_matrix <- as.matrix(test_set[, !colnames(train_set) %in% "Unit.Price....Gallon."]) 

dtrain <- xgb.DMatrix(data = train_matrix, label = train_set$Unit.Price....Gallon.)
dtest <- xgb.DMatrix(data = test_matrix, label = test_set$Unit.Price....Gallon.)

params <- list(
  objective = "reg:squarederror",
  eta = 0.1,  
  max_depth = 6,  
  colsample_bytree = 0.5,  
  subsample = 0.8  
)

model_xgb <- xgb.train(
  params = params,
  data = dtrain,
  nrounds = 100,  
  watchlist = list(train = dtrain, test = dtest),
  early_stopping_rounds = 10,  
  verbose = 0
)

predictions_xgb <- predict(model_xgb, dtest)

# R²  RMSE
r2_xgb <- cor(test_set$Unit.Price....Gallon., predictions_xgb)^2
rmse_xgb <- sqrt(mean((test_set$Unit.Price....Gallon. - predictions_xgb)^2))


cat("📌 XGBoost 结果:\n")
## 📌 XGBoost 结果:
cat("✅ R²:", r2_xgb, "\n")
## ✅ R²: 0.1928907
cat("✅ RMSE:", rmse_xgb, "\n")
## ✅ RMSE: 0.3895091
importance_matrix <- xgb.importance(feature_names = colnames(train_matrix), model = model_xgb)
xgb.plot.importance(importance_matrix)

xgb.plot.importance(importance_matrix, top_n = 15)  # 只显示前 15 个重要变量

# predict the price
predictions_xgb <- predict(model_xgb, dtest)
test_set$Predicted_Price <- predictions_xgb

head(test_set[, c("Unit.Price....Gallon.", "Predicted_Price")])
## # A tibble: 6 × 2
##   Unit.Price....Gallon. Predicted_Price
##                   <dbl>           <dbl>
## 1                  3.77            3.36
## 2                  3.49            3.03
## 3                  3.81            3.53
## 4                  3.65            3.54
## 5                  2.99            3.26
## 6                  3.83            3.46
library(ggplot2)

ggplot(test_set, aes(x = Unit.Price....Gallon., y = Predicted_Price)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_abline(intercept = 0, slope = 1, color = "red", linetype = "dashed") +
  labs(title = "real  vs predicted") +
  theme_minimal()