library(readxl)
library(tidyverse)  
library(caret)      
library(ggplot2)    
library(dplyr)
library(car)
library(DMwR2)
library(ROSE)
library(corrplot)
library(reshape2)
library(vcd)
library(pROC)
library(knitr)
library(randomForest)
library(xgboost)
library(GGally)
library(ggpubr)
library(nortest)
library(stringr)
library(writexl)
library(moments)
library(Metrics)
file_path <- "C:/Users/yuan1/Downloads/transfer_pricing_daily_data_v2.xlsx"
price<- readxl::read_excel(file_path)
# make the names readable for R
colnames(price) <- make.names(colnames(price))
print(colnames(price))
##  [1] "Company.Name"                             
##  [2] "Business.Unit"                            
##  [3] "Report.Period"                            
##  [4] "Currency"                                 
##  [5] "Region"                                   
##  [6] "Transfer.Pricing.Method"                  
##  [7] "Benchmarking"                             
##  [8] "Transaction.Volume..Gallons."             
##  [9] "Unit.Price....Gallon."                    
## [10] "Market.Reference.Price....Gallon."        
## [11] "Trade.Terms"                              
## [12] "Crude.Oil.Purchase.Price....Barrel."      
## [13] "Refining.Cost....Gallon."                 
## [14] "Transport.Cost....Gallon."                
## [15] "Storage.Cost....Gallon."                  
## [16] "Pipeline.Tariffs....Gallon."              
## [17] "Terminal.Fees....Gallon."                 
## [18] "Operating.Expenses....Gallon."            
## [19] "Depreciation.Costs....Gallon."            
## [20] "Environmental.Compliance.Costs....Gallon."
## [21] "Transfer.Pricing.Formula"                 
## [22] "Market.Benchmark....Gallon."              
## [23] "Profit.Margin...."                        
## [24] "Risk.Adjustments....Gallon."              
## [25] "Net.Profit.Margin...."                    
## [26] "Return.on.Assets...."                     
## [27] "Competitor.Price....Gallon."              
## [28] "Regional.Supply.Demand.Trends"            
## [29] "Alternative.Energy.Impact"                
## [30] "Excise.Tax....Gallon."                    
## [31] "Value.Added.Tax...."                      
## [32] "Carbon.Emission.Tax....Gallon."           
## [33] "Market.Price.Volatility...."              
## [34] "Exchange.Rate.Risk...."                   
## [35] "Geopolitical.Risk"                        
## [36] "Refinery.Utilization.Rate...."            
## [37] "Processing.Capacity..Barrels.Day."        
## [38] "Yield.per.Barrel..Gallons."               
## [39] "Total.Transaction.Value...."              
## [40] "Refinery.Gross.Margin....Gallon."         
## [41] "Break.even.Price....Gallon."              
## [42] "Product.Name"
# count the varibale has only one level
unique_counts <- sapply(price, function(x) n_distinct(x, na.rm = TRUE))
one_level_vars <- names(unique_counts[unique_counts == 1])
print(one_level_vars)
## [1] "Company.Name"             "Business.Unit"           
## [3] "Currency"                 "Region"                  
## [5] "Transfer.Pricing.Method"  "Benchmarking"            
## [7] "Transfer.Pricing.Formula" "Product.Name"
price <- price %>% select(-one_level_vars)
print(price)
## # A tibble: 366 × 34
##    Report.Period Transaction.Volume..Gallons. Unit.Price....Gallon.
##    <chr>                                <dbl>                 <dbl>
##  1 2024-01-01                          438352                  3.77
##  2 2024-01-02                          184378                  3.97
##  3 2024-01-03                          141337                  3.19
##  4 2024-01-04                           43593                  3.88
##  5 2024-01-05                          225908                  3.83
##  6 2024-01-06                          167237                  3.49
##  7 2024-01-07                          330267                  2.68
##  8 2024-01-08                          238738                  3.49
##  9 2024-01-09                          138323                  3.65
## 10 2024-01-10                          305710                  3.64
## # ℹ 356 more rows
## # ℹ 31 more variables: Market.Reference.Price....Gallon. <dbl>,
## #   Trade.Terms <chr>, Crude.Oil.Purchase.Price....Barrel. <dbl>,
## #   Refining.Cost....Gallon. <dbl>, Transport.Cost....Gallon. <dbl>,
## #   Storage.Cost....Gallon. <dbl>, Pipeline.Tariffs....Gallon. <dbl>,
## #   Terminal.Fees....Gallon. <dbl>, Operating.Expenses....Gallon. <dbl>,
## #   Depreciation.Costs....Gallon. <dbl>, …
# look for missing data
sum(is.na(price)) 
## [1] 0
#look for duplicate data
sum(duplicated(price))
## [1] 0
str(price)
## tibble [366 × 34] (S3: tbl_df/tbl/data.frame)
##  $ Report.Period                            : chr [1:366] "2024-01-01" "2024-01-02" "2024-01-03" "2024-01-04" ...
##  $ Transaction.Volume..Gallons.             : num [1:366] 438352 184378 141337 43593 225908 ...
##  $ Unit.Price....Gallon.                    : num [1:366] 3.77 3.97 3.19 3.88 3.83 3.49 2.68 3.49 3.65 3.64 ...
##  $ Market.Reference.Price....Gallon.        : num [1:366] 2.98 3.73 2.51 3.86 3.67 2.69 2.84 3.27 2.92 3.97 ...
##  $ Trade.Terms                              : chr [1:366] "CIF" "FOB" "CIF" "CIF" ...
##  $ Crude.Oil.Purchase.Price....Barrel.      : num [1:366] 78.8 67 88.1 70.9 64.7 ...
##  $ Refining.Cost....Gallon.                 : num [1:366] 0.39 0.26 0.25 0.2 0.43 0.45 0.38 0.41 0.41 0.35 ...
##  $ Transport.Cost....Gallon.                : num [1:366] 0.06 0.13 0.11 0.1 0.11 0.15 0.13 0.1 0.12 0.14 ...
##  $ Storage.Cost....Gallon.                  : num [1:366] 0.05 0.07 0.04 0.03 0.05 0.05 0.05 0.07 0.07 0.07 ...
##  $ Pipeline.Tariffs....Gallon.              : num [1:366] 0.05 0.01 0.01 0.04 0.03 0.04 0.02 0.04 0.01 0.03 ...
##  $ Terminal.Fees....Gallon.                 : num [1:366] 0.06 0.06 0.04 0.04 0.02 0.06 0.03 0.03 0.05 0.03 ...
##  $ Operating.Expenses....Gallon.            : num [1:366] 0.24 0.19 0.26 0.23 0.25 0.24 0.17 0.12 0.26 0.27 ...
##  $ Depreciation.Costs....Gallon.            : num [1:366] 0.05 0.06 0.15 0.12 0.11 0.18 0.16 0.1 0.18 0.06 ...
##  $ Environmental.Compliance.Costs....Gallon.: num [1:366] 0.03 0.03 0.02 0.04 0.05 0.02 0.05 0.03 0.02 0.01 ...
##  $ Market.Benchmark....Gallon.              : num [1:366] 3.27 3.85 3.11 2.79 2.87 3.98 2.85 3.14 3.71 2.7 ...
##  $ Profit.Margin....                        : num [1:366] 11.94 12.13 6.39 13.06 13.81 ...
##  $ Risk.Adjustments....Gallon.              : num [1:366] 0.09 0.08 0.03 0.09 0.03 0.04 0.02 0.06 0.02 0.02 ...
##  $ Net.Profit.Margin....                    : num [1:366] 4.93 7.72 5.93 8.01 6.67 8.22 3.75 8.8 7.51 8.52 ...
##  $ Return.on.Assets....                     : num [1:366] 5.08 5.53 3.4 4.2 2.82 4.37 7.16 2.14 7.76 4.35 ...
##  $ Competitor.Price....Gallon.              : num [1:366] 3.18 3.27 4 3.47 3.25 3.05 3.58 3.21 2.97 3.85 ...
##  $ Regional.Supply.Demand.Trends            : chr [1:366] "Stable" "Stable" "Increasing Demand" "Stable" ...
##  $ Alternative.Energy.Impact                : chr [1:366] "High" "Moderate" "Moderate" "Moderate" ...
##  $ Excise.Tax....Gallon.                    : num [1:366] 0.1 0.08 0.06 0.08 0.07 0.14 0.16 0.16 0.06 0.19 ...
##  $ Value.Added.Tax....                      : num [1:366] 17.6 11.42 9.4 10.62 7.86 ...
##  $ Carbon.Emission.Tax....Gallon.           : num [1:366] 0.01 0.03 0.09 0.05 0.08 0.05 0.01 0.03 0.1 0.04 ...
##  $ Market.Price.Volatility....              : num [1:366] 2.9 4.12 4.09 7.86 2.74 7.2 5.49 1.61 7.4 2.01 ...
##  $ Exchange.Rate.Risk....                   : num [1:366] 3.4 1.46 2.09 2.72 4.2 1.21 1.82 4.99 2.6 2.65 ...
##  $ Geopolitical.Risk                        : chr [1:366] "High" "High" "Low" "Medium" ...
##  $ Refinery.Utilization.Rate....            : num [1:366] 80 91.3 91.3 86.3 90.8 ...
##  $ Processing.Capacity..Barrels.Day.        : num [1:366] 138129 355755 397510 174968 317043 ...
##  $ Yield.per.Barrel..Gallons.               : num [1:366] 33 31 35 34 34 34 30 32 36 32 ...
##  $ Total.Transaction.Value....              : num [1:366] 1652587 731981 450865 169141 865228 ...
##  $ Refinery.Gross.Margin....Gallon.         : num [1:366] 2.59 3.47 2.26 3.66 3.24 2.24 2.46 2.86 2.51 3.62 ...
##  $ Break.even.Price....Gallon.              : num [1:366] 0.45 0.39 0.36 0.3 0.54 0.6 0.51 0.51 0.53 0.49 ...
#chagne chr to factor for modeleling 
# Convert selected character columns to factor
price <- price %>%
  mutate(
    Trade.Terms = as.factor(Trade.Terms),
    Regional.Supply.Demand.Trends = as.factor(Regional.Supply.Demand.Trends),
    Alternative.Energy.Impact = as.factor(Alternative.Energy.Impact),
    Geopolitical.Risk = as.factor(Geopolitical.Risk)
  )

# Convert to date format
price$Report.Period <- as.Date(price$Report.Period, format="%Y-%m-%d")
str(price$Report.Period)
##  Date[1:366], format: "2024-01-01" "2024-01-02" "2024-01-03" "2024-01-04" "2024-01-05" ...
str(price)
## tibble [366 × 34] (S3: tbl_df/tbl/data.frame)
##  $ Report.Period                            : Date[1:366], format: "2024-01-01" "2024-01-02" ...
##  $ Transaction.Volume..Gallons.             : num [1:366] 438352 184378 141337 43593 225908 ...
##  $ Unit.Price....Gallon.                    : num [1:366] 3.77 3.97 3.19 3.88 3.83 3.49 2.68 3.49 3.65 3.64 ...
##  $ Market.Reference.Price....Gallon.        : num [1:366] 2.98 3.73 2.51 3.86 3.67 2.69 2.84 3.27 2.92 3.97 ...
##  $ Trade.Terms                              : Factor w/ 3 levels "CIF","DAP","FOB": 1 3 1 1 3 2 1 3 3 2 ...
##  $ Crude.Oil.Purchase.Price....Barrel.      : num [1:366] 78.8 67 88.1 70.9 64.7 ...
##  $ Refining.Cost....Gallon.                 : num [1:366] 0.39 0.26 0.25 0.2 0.43 0.45 0.38 0.41 0.41 0.35 ...
##  $ Transport.Cost....Gallon.                : num [1:366] 0.06 0.13 0.11 0.1 0.11 0.15 0.13 0.1 0.12 0.14 ...
##  $ Storage.Cost....Gallon.                  : num [1:366] 0.05 0.07 0.04 0.03 0.05 0.05 0.05 0.07 0.07 0.07 ...
##  $ Pipeline.Tariffs....Gallon.              : num [1:366] 0.05 0.01 0.01 0.04 0.03 0.04 0.02 0.04 0.01 0.03 ...
##  $ Terminal.Fees....Gallon.                 : num [1:366] 0.06 0.06 0.04 0.04 0.02 0.06 0.03 0.03 0.05 0.03 ...
##  $ Operating.Expenses....Gallon.            : num [1:366] 0.24 0.19 0.26 0.23 0.25 0.24 0.17 0.12 0.26 0.27 ...
##  $ Depreciation.Costs....Gallon.            : num [1:366] 0.05 0.06 0.15 0.12 0.11 0.18 0.16 0.1 0.18 0.06 ...
##  $ Environmental.Compliance.Costs....Gallon.: num [1:366] 0.03 0.03 0.02 0.04 0.05 0.02 0.05 0.03 0.02 0.01 ...
##  $ Market.Benchmark....Gallon.              : num [1:366] 3.27 3.85 3.11 2.79 2.87 3.98 2.85 3.14 3.71 2.7 ...
##  $ Profit.Margin....                        : num [1:366] 11.94 12.13 6.39 13.06 13.81 ...
##  $ Risk.Adjustments....Gallon.              : num [1:366] 0.09 0.08 0.03 0.09 0.03 0.04 0.02 0.06 0.02 0.02 ...
##  $ Net.Profit.Margin....                    : num [1:366] 4.93 7.72 5.93 8.01 6.67 8.22 3.75 8.8 7.51 8.52 ...
##  $ Return.on.Assets....                     : num [1:366] 5.08 5.53 3.4 4.2 2.82 4.37 7.16 2.14 7.76 4.35 ...
##  $ Competitor.Price....Gallon.              : num [1:366] 3.18 3.27 4 3.47 3.25 3.05 3.58 3.21 2.97 3.85 ...
##  $ Regional.Supply.Demand.Trends            : Factor w/ 3 levels "Decreasing Demand",..: 3 3 2 3 3 3 2 2 2 2 ...
##  $ Alternative.Energy.Impact                : Factor w/ 3 levels "High","Low","Moderate": 1 3 3 3 3 2 3 1 2 1 ...
##  $ Excise.Tax....Gallon.                    : num [1:366] 0.1 0.08 0.06 0.08 0.07 0.14 0.16 0.16 0.06 0.19 ...
##  $ Value.Added.Tax....                      : num [1:366] 17.6 11.42 9.4 10.62 7.86 ...
##  $ Carbon.Emission.Tax....Gallon.           : num [1:366] 0.01 0.03 0.09 0.05 0.08 0.05 0.01 0.03 0.1 0.04 ...
##  $ Market.Price.Volatility....              : num [1:366] 2.9 4.12 4.09 7.86 2.74 7.2 5.49 1.61 7.4 2.01 ...
##  $ Exchange.Rate.Risk....                   : num [1:366] 3.4 1.46 2.09 2.72 4.2 1.21 1.82 4.99 2.6 2.65 ...
##  $ Geopolitical.Risk                        : Factor w/ 3 levels "High","Low","Medium": 1 1 2 3 2 2 1 1 1 2 ...
##  $ Refinery.Utilization.Rate....            : num [1:366] 80 91.3 91.3 86.3 90.8 ...
##  $ Processing.Capacity..Barrels.Day.        : num [1:366] 138129 355755 397510 174968 317043 ...
##  $ Yield.per.Barrel..Gallons.               : num [1:366] 33 31 35 34 34 34 30 32 36 32 ...
##  $ Total.Transaction.Value....              : num [1:366] 1652587 731981 450865 169141 865228 ...
##  $ Refinery.Gross.Margin....Gallon.         : num [1:366] 2.59 3.47 2.26 3.66 3.24 2.24 2.46 2.86 2.51 3.62 ...
##  $ Break.even.Price....Gallon.              : num [1:366] 0.45 0.39 0.36 0.3 0.54 0.6 0.51 0.51 0.53 0.49 ...
#data summary
summary(price)
##  Report.Period        Transaction.Volume..Gallons. Unit.Price....Gallon.
##  Min.   :2024-01-01   Min.   : 11043               Min.   :2.500        
##  1st Qu.:2024-04-01   1st Qu.:132563               1st Qu.:2.890        
##  Median :2024-07-01   Median :238005               Median :3.285        
##  Mean   :2024-07-01   Mean   :246641               Mean   :3.257        
##  3rd Qu.:2024-09-30   3rd Qu.:366264               3rd Qu.:3.600        
##  Max.   :2024-12-31   Max.   :497593               Max.   :4.000        
##  Market.Reference.Price....Gallon. Trade.Terms
##  Min.   :2.510                     CIF:131    
##  1st Qu.:2.973                     DAP:103    
##  Median :3.395                     FOB:132    
##  Mean   :3.369                                
##  3rd Qu.:3.780                                
##  Max.   :4.200                                
##  Crude.Oil.Purchase.Price....Barrel. Refining.Cost....Gallon.
##  Min.   :60.12                       Min.   :0.2000          
##  1st Qu.:66.25                       1st Qu.:0.2700          
##  Median :74.02                       Median :0.3400          
##  Mean   :74.39                       Mean   :0.3434          
##  3rd Qu.:82.21                       3rd Qu.:0.4200          
##  Max.   :89.88                       Max.   :0.5000          
##  Transport.Cost....Gallon. Storage.Cost....Gallon. Pipeline.Tariffs....Gallon.
##  Min.   :0.0500            Min.   :0.02000         Min.   :0.01000            
##  1st Qu.:0.0800            1st Qu.:0.04000         1st Qu.:0.02000            
##  Median :0.1000            Median :0.05000         Median :0.03000            
##  Mean   :0.1014            Mean   :0.04959         Mean   :0.03011            
##  3rd Qu.:0.1300            3rd Qu.:0.06000         3rd Qu.:0.04000            
##  Max.   :0.1500            Max.   :0.08000         Max.   :0.05000            
##  Terminal.Fees....Gallon. Operating.Expenses....Gallon.
##  Min.   :0.02000          Min.   :0.1000               
##  1st Qu.:0.03000          1st Qu.:0.1500               
##  Median :0.04000          Median :0.2100               
##  Mean   :0.03975          Mean   :0.2015               
##  3rd Qu.:0.05000          3rd Qu.:0.2500               
##  Max.   :0.06000          Max.   :0.3000               
##  Depreciation.Costs....Gallon. Environmental.Compliance.Costs....Gallon.
##  Min.   :0.0500                Min.   :0.01000                          
##  1st Qu.:0.0800                1st Qu.:0.02000                          
##  Median :0.1200                Median :0.03000                          
##  Mean   :0.1240                Mean   :0.03005                          
##  3rd Qu.:0.1675                3rd Qu.:0.04000                          
##  Max.   :0.2000                Max.   :0.05000                          
##  Market.Benchmark....Gallon. Profit.Margin.... Risk.Adjustments....Gallon.
##  Min.   :2.600               Min.   : 5.040    Min.   :0.01000            
##  1st Qu.:2.940               1st Qu.: 7.293    1st Qu.:0.03000            
##  Median :3.285               Median : 9.575    Median :0.05000            
##  Mean   :3.284               Mean   : 9.783    Mean   :0.05413            
##  3rd Qu.:3.660               3rd Qu.:11.985    3rd Qu.:0.08000            
##  Max.   :4.000               Max.   :14.950    Max.   :0.10000            
##  Net.Profit.Margin.... Return.on.Assets.... Competitor.Price....Gallon.
##  Min.   :3.020         Min.   :2.010        Min.   :2.510              
##  1st Qu.:4.825         1st Qu.:3.442        1st Qu.:2.960              
##  Median :6.590         Median :4.940        Median :3.340              
##  Mean   :6.489         Mean   :4.949        Mean   :3.333              
##  3rd Qu.:8.137         3rd Qu.:6.410        3rd Qu.:3.720              
##  Max.   :9.960         Max.   :7.990        Max.   :4.090              
##    Regional.Supply.Demand.Trends Alternative.Energy.Impact
##  Decreasing Demand:118           High    :117             
##  Increasing Demand:124           Low     :123             
##  Stable           :124           Moderate:126             
##                                                           
##                                                           
##                                                           
##  Excise.Tax....Gallon. Value.Added.Tax.... Carbon.Emission.Tax....Gallon.
##  Min.   :0.050         Min.   : 5.070      Min.   :0.01000               
##  1st Qu.:0.090         1st Qu.: 9.367      1st Qu.:0.03000               
##  Median :0.120         Median :13.160      Median :0.05000               
##  Mean   :0.123         Mean   :12.759      Mean   :0.05434               
##  3rd Qu.:0.160         3rd Qu.:16.137      3rd Qu.:0.08000               
##  Max.   :0.200         Max.   :19.900      Max.   :0.10000               
##  Market.Price.Volatility.... Exchange.Rate.Risk.... Geopolitical.Risk
##  Min.   :1.040               Min.   :0.510          High  :123       
##  1st Qu.:3.103               1st Qu.:1.512          Low   :113       
##  Median :5.110               Median :2.660          Medium:130       
##  Mean   :5.416               Mean   :2.648                           
##  3rd Qu.:7.577               3rd Qu.:3.690                           
##  Max.   :9.980               Max.   :5.000                           
##  Refinery.Utilization.Rate.... Processing.Capacity..Barrels.Day.
##  Min.   :75.03                 Min.   :100399                   
##  1st Qu.:80.32                 1st Qu.:197118                   
##  Median :85.64                 Median :297231                   
##  Mean   :85.37                 Mean   :295243                   
##  3rd Qu.:90.46                 3rd Qu.:392937                   
##  Max.   :94.98                 Max.   :499672                   
##  Yield.per.Barrel..Gallons. Total.Transaction.Value....
##  Min.   :30.0               Min.   :  31141            
##  1st Qu.:33.0               1st Qu.: 421065            
##  Median :36.0               Median : 764578            
##  Mean   :35.5               Mean   : 805110            
##  3rd Qu.:39.0               3rd Qu.:1202052            
##  Max.   :41.0               Max.   :1946554            
##  Refinery.Gross.Margin....Gallon. Break.even.Price....Gallon.
##  Min.   :2.080                    Min.   :0.2500             
##  1st Qu.:2.603                    1st Qu.:0.3700             
##  Median :3.065                    Median :0.4400             
##  Mean   :3.025                    Mean   :0.4449             
##  3rd Qu.:3.428                    3rd Qu.:0.5200             
##  Max.   :3.970                    Max.   :0.6500
numeric <- price[, sapply(price, is.numeric), drop = FALSE]
factor <- price[, sapply(price, is.factor), drop = FALSE]
numeric <- numeric %>%
  rename_with(~ str_wrap(.x, width = 20))  

numeric %>%
  gather(key = "Variable", value = "Value") %>%
  ggplot(aes(x = Value)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
  geom_density(color = "blue", linewidth = 0.5) +
  facet_wrap(~Variable, scales = "free") +
  theme_minimal() +
   theme(strip.text = element_text(size = 6)) +
  ggtitle("Histogram and Density Plot of Numeric Variables")

# format the plot
numeric <- numeric %>%
  rename_with(~ stringr::str_wrap(.x, width = 20))

numeric <- numeric %>%
  gather(key = "Variable", value = "Value")

#  Q-Q Plot
ggplot(numeric, aes(sample = Value)) +
  stat_qq() +
  stat_qq_line(color = "red") +  
  facet_wrap(~Variable, scales = "free") + 
  theme_minimal() +
  theme(strip.text = element_text(size = 6)) +
  ggtitle("Q-Q Plot of Numeric Variables")

# Select all numeric variables
num_vars <- price %>% dplyr::select(where(is.numeric))

# Perform Shapiro-Wilk test for all numeric variables
shapiro_results <- sapply(num_vars, function(x) shapiro.test(x)$p.value)

# Filter out non-normally distributed variables (p < 0.05 means significantly deviating from normality)
non_normal_vars <- names(shapiro_results[shapiro_results < 0.05])

# Display variables that are not normally distributed
print(non_normal_vars)
##  [1] "Transaction.Volume..Gallons."             
##  [2] "Unit.Price....Gallon."                    
##  [3] "Market.Reference.Price....Gallon."        
##  [4] "Crude.Oil.Purchase.Price....Barrel."      
##  [5] "Refining.Cost....Gallon."                 
##  [6] "Transport.Cost....Gallon."                
##  [7] "Storage.Cost....Gallon."                  
##  [8] "Pipeline.Tariffs....Gallon."              
##  [9] "Terminal.Fees....Gallon."                 
## [10] "Operating.Expenses....Gallon."            
## [11] "Depreciation.Costs....Gallon."            
## [12] "Environmental.Compliance.Costs....Gallon."
## [13] "Market.Benchmark....Gallon."              
## [14] "Profit.Margin...."                        
## [15] "Risk.Adjustments....Gallon."              
## [16] "Net.Profit.Margin...."                    
## [17] "Return.on.Assets...."                     
## [18] "Competitor.Price....Gallon."              
## [19] "Excise.Tax....Gallon."                    
## [20] "Value.Added.Tax...."                      
## [21] "Carbon.Emission.Tax....Gallon."           
## [22] "Market.Price.Volatility...."              
## [23] "Exchange.Rate.Risk...."                   
## [24] "Refinery.Utilization.Rate...."            
## [25] "Processing.Capacity..Barrels.Day."        
## [26] "Yield.per.Barrel..Gallons."               
## [27] "Total.Transaction.Value...."              
## [28] "Refinery.Gross.Margin....Gallon."         
## [29] "Break.even.Price....Gallon."
num_vars <- price  %>% dplyr::select(where(is.numeric))

#  Shapiro-Wilk  W > 0.97 normal distribution. W < 0.95 need log transferring 
shapiro_w_values <- sapply(num_vars, function(x) shapiro.test(x)$statistic)
print(shapiro_w_values)
##              Transaction.Volume..Gallons..W 
##                                   0.9564545 
##                     Unit.Price....Gallon..W 
##                                   0.9608380 
##         Market.Reference.Price....Gallon..W 
##                                   0.9572767 
##       Crude.Oil.Purchase.Price....Barrel..W 
##                                   0.9446090 
##                  Refining.Cost....Gallon..W 
##                                   0.9517145 
##                 Transport.Cost....Gallon..W 
##                                   0.9456603 
##                   Storage.Cost....Gallon..W 
##                                   0.9401415 
##               Pipeline.Tariffs....Gallon..W 
##                                   0.9124105 
##                  Terminal.Fees....Gallon..W 
##                                   0.9055461 
##             Operating.Expenses....Gallon..W 
##                                   0.9527556 
##             Depreciation.Costs....Gallon..W 
##                                   0.9402306 
## Environmental.Compliance.Costs....Gallon..W 
##                                   0.9126770 
##               Market.Benchmark....Gallon..W 
##                                   0.9466619 
##                         Profit.Margin.....W 
##                                   0.9556320 
##               Risk.Adjustments....Gallon..W 
##                                   0.9290836 
##                     Net.Profit.Margin.....W 
##                                   0.9610444 
##                      Return.on.Assets.....W 
##                                   0.9550355 
##               Competitor.Price....Gallon..W 
##                                   0.9610444 
##                     Excise.Tax....Gallon..W 
##                                   0.9534736 
##                       Value.Added.Tax.....W 
##                                   0.9614095 
##            Carbon.Emission.Tax....Gallon..W 
##                                   0.9489732 
##               Market.Price.Volatility.....W 
##                                   0.9507635 
##                    Exchange.Rate.Risk.....W 
##                                   0.9543719 
##             Refinery.Utilization.Rate.....W 
##                                   0.9527396 
##         Processing.Capacity..Barrels.Day..W 
##                                   0.9553617 
##                Yield.per.Barrel..Gallons..W 
##                                   0.9371891 
##               Total.Transaction.Value.....W 
##                                   0.9627231 
##          Refinery.Gross.Margin....Gallon..W 
##                                   0.9649656 
##               Break.even.Price....Gallon..W 
##                                   0.9777192
num_vars <- price %>% select(where(is.numeric))
skewness_values <- sapply(num_vars, skewness)
print(skewness_values)
##              Transaction.Volume..Gallons. 
##                               0.121901357 
##                     Unit.Price....Gallon. 
##                              -0.027283921 
##         Market.Reference.Price....Gallon. 
##                              -0.117339326 
##       Crude.Oil.Purchase.Price....Barrel. 
##                               0.113995899 
##                  Refining.Cost....Gallon. 
##                               0.120508662 
##                 Transport.Cost....Gallon. 
##                              -0.067292853 
##                   Storage.Cost....Gallon. 
##                               0.132075735 
##               Pipeline.Tariffs....Gallon. 
##                               0.014786441 
##                  Terminal.Fees....Gallon. 
##                               0.085572735 
##             Operating.Expenses....Gallon. 
##                              -0.047390165 
##             Depreciation.Costs....Gallon. 
##                               0.054832943 
## Environmental.Compliance.Costs....Gallon. 
##                              -0.029533925 
##               Market.Benchmark....Gallon. 
##                               0.036954080 
##                         Profit.Margin.... 
##                               0.143909381 
##               Risk.Adjustments....Gallon. 
##                               0.040637845 
##                     Net.Profit.Margin.... 
##                              -0.005319028 
##                      Return.on.Assets.... 
##                               0.049559175 
##               Competitor.Price....Gallon. 
##                              -0.058777167 
##                     Excise.Tax....Gallon. 
##                               0.126985910 
##                       Value.Added.Tax.... 
##                              -0.141642001 
##            Carbon.Emission.Tax....Gallon. 
##                               0.054645602 
##               Market.Price.Volatility.... 
##                               0.104497522 
##                    Exchange.Rate.Risk.... 
##                               0.110644036 
##             Refinery.Utilization.Rate.... 
##                              -0.061608004 
##         Processing.Capacity..Barrels.Day. 
##                              -0.042291028 
##                Yield.per.Barrel..Gallons. 
##                              -0.058149776 
##               Total.Transaction.Value.... 
##                               0.310931782 
##          Refinery.Gross.Margin....Gallon. 
##                              -0.117458363 
##               Break.even.Price....Gallon. 
##                               0.081942787
# look for num outliers
numeric_vars <- price %>% dplyr::select(where(is.numeric))

outliers <- lapply(numeric_vars, function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)
  IQR <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  which(x < lower_bound | x > upper_bound)  
})

outliers
## $Transaction.Volume..Gallons.
## integer(0)
## 
## $Unit.Price....Gallon.
## integer(0)
## 
## $Market.Reference.Price....Gallon.
## integer(0)
## 
## $Crude.Oil.Purchase.Price....Barrel.
## integer(0)
## 
## $Refining.Cost....Gallon.
## integer(0)
## 
## $Transport.Cost....Gallon.
## integer(0)
## 
## $Storage.Cost....Gallon.
## integer(0)
## 
## $Pipeline.Tariffs....Gallon.
## integer(0)
## 
## $Terminal.Fees....Gallon.
## integer(0)
## 
## $Operating.Expenses....Gallon.
## integer(0)
## 
## $Depreciation.Costs....Gallon.
## integer(0)
## 
## $Environmental.Compliance.Costs....Gallon.
## integer(0)
## 
## $Market.Benchmark....Gallon.
## integer(0)
## 
## $Profit.Margin....
## integer(0)
## 
## $Risk.Adjustments....Gallon.
## integer(0)
## 
## $Net.Profit.Margin....
## integer(0)
## 
## $Return.on.Assets....
## integer(0)
## 
## $Competitor.Price....Gallon.
## integer(0)
## 
## $Excise.Tax....Gallon.
## integer(0)
## 
## $Value.Added.Tax....
## integer(0)
## 
## $Carbon.Emission.Tax....Gallon.
## integer(0)
## 
## $Market.Price.Volatility....
## integer(0)
## 
## $Exchange.Rate.Risk....
## integer(0)
## 
## $Refinery.Utilization.Rate....
## integer(0)
## 
## $Processing.Capacity..Barrels.Day.
## integer(0)
## 
## $Yield.per.Barrel..Gallons.
## integer(0)
## 
## $Total.Transaction.Value....
## integer(0)
## 
## $Refinery.Gross.Margin....Gallon.
## integer(0)
## 
## $Break.even.Price....Gallon.
## integer(0)

price <- price %>% mutate(across(where(is.numeric), scale))

cor_values <- sapply(price %>% select(where(is.numeric)), function(x) cor(x, price[["Unit.Price....Gallon."]], use = "complete.obs"))

#  |cor| < 0.1 
low_corr_vars <- names(cor_values[abs(cor_values) < 0.1])

print("Variables with Low Absolute Correlation to Target:")
## [1] "Variables with Low Absolute Correlation to Target:"
print(low_corr_vars)
##  [1] "Transaction.Volume..Gallons."             
##  [2] "Crude.Oil.Purchase.Price....Barrel."      
##  [3] "Refining.Cost....Gallon."                 
##  [4] "Transport.Cost....Gallon."                
##  [5] "Storage.Cost....Gallon."                  
##  [6] "Pipeline.Tariffs....Gallon."              
##  [7] "Terminal.Fees....Gallon."                 
##  [8] "Operating.Expenses....Gallon."            
##  [9] "Depreciation.Costs....Gallon."            
## [10] "Environmental.Compliance.Costs....Gallon."
## [11] "Market.Benchmark....Gallon."              
## [12] "Profit.Margin...."                        
## [13] "Risk.Adjustments....Gallon."              
## [14] "Net.Profit.Margin...."                    
## [15] "Return.on.Assets...."                     
## [16] "Competitor.Price....Gallon."              
## [17] "Excise.Tax....Gallon."                    
## [18] "Value.Added.Tax...."                      
## [19] "Carbon.Emission.Tax....Gallon."           
## [20] "Market.Price.Volatility...."              
## [21] "Refinery.Utilization.Rate...."            
## [22] "Processing.Capacity..Barrels.Day."        
## [23] "Break.even.Price....Gallon."
numeric <- price[, sapply(price, is.numeric), drop = FALSE]

# look for the for the correlation between num and y
cor_results <- cor(numeric, use = "complete.obs")  
cor_results["Unit.Price....Gallon.", ]  
##              Transaction.Volume..Gallons. 
##                               0.031694613 
##                     Unit.Price....Gallon. 
##                               1.000000000 
##         Market.Reference.Price....Gallon. 
##                               0.127159227 
##       Crude.Oil.Purchase.Price....Barrel. 
##                               0.019416446 
##                  Refining.Cost....Gallon. 
##                              -0.059677350 
##                 Transport.Cost....Gallon. 
##                               0.032534250 
##                   Storage.Cost....Gallon. 
##                               0.017488733 
##               Pipeline.Tariffs....Gallon. 
##                              -0.002559830 
##                  Terminal.Fees....Gallon. 
##                               0.019706524 
##             Operating.Expenses....Gallon. 
##                               0.047817560 
##             Depreciation.Costs....Gallon. 
##                              -0.050515793 
## Environmental.Compliance.Costs....Gallon. 
##                               0.055733286 
##               Market.Benchmark....Gallon. 
##                              -0.046882739 
##                         Profit.Margin.... 
##                              -0.076025599 
##               Risk.Adjustments....Gallon. 
##                              -0.054379813 
##                     Net.Profit.Margin.... 
##                              -0.078125814 
##                      Return.on.Assets.... 
##                               0.008796997 
##               Competitor.Price....Gallon. 
##                               0.083188138 
##                     Excise.Tax....Gallon. 
##                              -0.037687003 
##                       Value.Added.Tax.... 
##                               0.045761039 
##            Carbon.Emission.Tax....Gallon. 
##                              -0.004259401 
##               Market.Price.Volatility.... 
##                              -0.020275851 
##                    Exchange.Rate.Risk.... 
##                               0.103598985 
##             Refinery.Utilization.Rate.... 
##                               0.048157393 
##         Processing.Capacity..Barrels.Day. 
##                              -0.018505055 
##                Yield.per.Barrel..Gallons. 
##                               0.107334769 
##               Total.Transaction.Value.... 
##                               0.253527157 
##          Refinery.Gross.Margin....Gallon. 
##                               0.136125104 
##               Break.even.Price....Gallon. 
##                              -0.045477075
#cor > 0.5 →positive relation, price go up when ind bigger
#cor < -0.5 → neg relation, price go down when ind bigger
#cor ≈ 0 → no correlation
dim(price)
## [1] 366  34
price <- price %>% select(-all_of(low_corr_vars))
str(price)
## tibble [366 × 11] (S3: tbl_df/tbl/data.frame)
##  $ Report.Period                    : Date[1:366], format: "2024-01-01" "2024-01-02" ...
##  $ Unit.Price....Gallon.            : num [1:366] 3.77 3.97 3.19 3.88 3.83 3.49 2.68 3.49 3.65 3.64 ...
##  $ Market.Reference.Price....Gallon.: num [1:366] 2.98 3.73 2.51 3.86 3.67 2.69 2.84 3.27 2.92 3.97 ...
##  $ Trade.Terms                      : Factor w/ 3 levels "CIF","DAP","FOB": 1 3 1 1 3 2 1 3 3 2 ...
##  $ Regional.Supply.Demand.Trends    : Factor w/ 3 levels "Decreasing Demand",..: 3 3 2 3 3 3 2 2 2 2 ...
##  $ Alternative.Energy.Impact        : Factor w/ 3 levels "High","Low","Moderate": 1 3 3 3 3 2 3 1 2 1 ...
##  $ Exchange.Rate.Risk....           : num [1:366] 3.4 1.46 2.09 2.72 4.2 1.21 1.82 4.99 2.6 2.65 ...
##  $ Geopolitical.Risk                : Factor w/ 3 levels "High","Low","Medium": 1 1 2 3 2 2 1 1 1 2 ...
##  $ Yield.per.Barrel..Gallons.       : num [1:366] 33 31 35 34 34 34 30 32 36 32 ...
##  $ Total.Transaction.Value....      : num [1:366] 1652587 731981 450865 169141 865228 ...
##  $ Refinery.Gross.Margin....Gallon. : num [1:366] 2.59 3.47 2.26 3.66 3.24 2.24 2.46 2.86 2.51 3.62 ...
dim(price)
## [1] 366  11
#correlation look for correlation between num
numeric <- price[, sapply(price, is.numeric), drop = FALSE]

cor_matrix <- cor(numeric_vars, use = "complete.obs")

#  (|cor| > 0.7)
high_cor_pairs <- which(abs(cor_matrix) > 0.7 & abs(cor_matrix) < 1, arr.ind = TRUE)

high_cor_df <- data.frame(
  Variable_1 = rownames(cor_matrix)[high_cor_pairs[, 1]],
  Variable_2 = colnames(cor_matrix)[high_cor_pairs[, 2]],
  Correlation = cor_matrix[high_cor_pairs]
)

high_cor_df <- high_cor_df %>%
  distinct(Variable_1, Variable_2, .keep_all = TRUE)

print(high_cor_df)
##                          Variable_1                        Variable_2
## 1       Total.Transaction.Value....      Transaction.Volume..Gallons.
## 2  Refinery.Gross.Margin....Gallon. Market.Reference.Price....Gallon.
## 3       Break.even.Price....Gallon.          Refining.Cost....Gallon.
## 4      Transaction.Volume..Gallons.       Total.Transaction.Value....
## 5 Market.Reference.Price....Gallon.  Refinery.Gross.Margin....Gallon.
## 6          Refining.Cost....Gallon.       Break.even.Price....Gallon.
##   Correlation
## 1   0.9667413
## 2   0.9841007
## 3   0.9452115
## 4   0.9667413
## 5   0.9841007
## 6   0.9452115
#delete the high correlated
price <- price[, setdiff(names(price), c("Total.Transaction.Value....", 
                                         "Refinery.Gross.Margin....Gallon.", 
                                         "Break.even.Price....Gallon.")), drop = FALSE]
dim(price)
## [1] 366   9
str(price)
## tibble [366 × 9] (S3: tbl_df/tbl/data.frame)
##  $ Report.Period                    : Date[1:366], format: "2024-01-01" "2024-01-02" ...
##  $ Unit.Price....Gallon.            : num [1:366] 3.77 3.97 3.19 3.88 3.83 3.49 2.68 3.49 3.65 3.64 ...
##  $ Market.Reference.Price....Gallon.: num [1:366] 2.98 3.73 2.51 3.86 3.67 2.69 2.84 3.27 2.92 3.97 ...
##  $ Trade.Terms                      : Factor w/ 3 levels "CIF","DAP","FOB": 1 3 1 1 3 2 1 3 3 2 ...
##  $ Regional.Supply.Demand.Trends    : Factor w/ 3 levels "Decreasing Demand",..: 3 3 2 3 3 3 2 2 2 2 ...
##  $ Alternative.Energy.Impact        : Factor w/ 3 levels "High","Low","Moderate": 1 3 3 3 3 2 3 1 2 1 ...
##  $ Exchange.Rate.Risk....           : num [1:366] 3.4 1.46 2.09 2.72 4.2 1.21 1.82 4.99 2.6 2.65 ...
##  $ Geopolitical.Risk                : Factor w/ 3 levels "High","Low","Medium": 1 1 2 3 2 2 1 1 1 2 ...
##  $ Yield.per.Barrel..Gallons.       : num [1:366] 33 31 35 34 34 34 30 32 36 32 ...
#build linear regression model
  
set.seed(5678)
train_index <- sample(1:nrow(price), 0.8 * nrow(price))
train_data <- price[train_index, ]
test_data <- price[-train_index, ]


lm_model <- lm(`Unit.Price....Gallon.` ~ ., data = train_data)

train_preds <- predict(lm_model, newdata = train_data)
test_preds <- predict(lm_model, newdata = test_data)

#  MSE
mse_train <- mean((train_data$`Unit.Price....Gallon.` - train_preds)^2)
mse_test <- mean((test_data$`Unit.Price....Gallon.` - test_preds)^2)

#  RMSE
rmse_train <- sqrt(mse_train)
rmse_test <- sqrt(mse_test)

#  R²
r_squared_train <- summary(lm_model)$r.squared
r_squared_test <- 1 - sum((test_preds - test_data$`Unit.Price....Gallon.`)^2) / 
                      sum((test_data$`Unit.Price....Gallon.` - mean(test_data$`Unit.Price....Gallon.`))^2)


cat("📌 ****\n")
## 📌 ****
cat("✅  MSE:", mse_train, "\n")
## ✅  MSE: 0.1733605
cat("✅  MSE:", mse_test, "\n")
## ✅  MSE: 0.1601503
cat("✅  RMSE:", rmse_train, "\n")
## ✅  RMSE: 0.4163658
cat("✅  RMSE:", rmse_test, "\n")
## ✅  RMSE: 0.4001878
cat("✅  R²:", r_squared_train, "\n")
## ✅  R²: 0.04952773
cat("✅  R²:", r_squared_test, "\n")
## ✅  R²: 0.05635562
# predict the price
# 预测 test_data
test_preds <- predict(lm_model, newdata = test_data)

# 将预测结果添加到 test_data
test_data$Predicted_Unit_Price <- test_preds

# 计算 MSE 和 RMSE 评估预测效果
mse_test <- mean((test_data$`Unit.Price....Gallon.` - test_preds)^2)
rmse_test <- sqrt(mse_test)
r_squared_test <- 1 - sum((test_preds - test_data$`Unit.Price....Gallon.`)^2) / 
                      sum((test_data$`Unit.Price....Gallon.` - mean(test_data$`Unit.Price....Gallon.`))^2)

# 打印预测结果
cat("📌 Test Data Prediction Results\n")
## 📌 Test Data Prediction Results
cat("✅ MSE:", mse_test, "\n")
## ✅ MSE: 0.1601503
cat("✅ RMSE:", rmse_test, "\n")
## ✅ RMSE: 0.4001878
cat("✅ R²:", r_squared_test, "\n")
## ✅ R²: 0.05635562
# 显示部分预测结果
head(test_data[, c("Report.Period", "Unit.Price....Gallon.", "Predicted_Unit_Price")])
## # A tibble: 6 × 3
##   Report.Period Unit.Price....Gallon. Predicted_Unit_Price
##   <date>                        <dbl>                <dbl>
## 1 2024-01-07                     2.68                 3.15
## 2 2024-01-08                     3.49                 3.19
## 3 2024-01-16                     3.65                 3.35
## 4 2024-01-21                     2.87                 3.37
## 5 2024-01-24                     3.51                 3.38
## 6 2024-02-02                     3.24                 3.37