library(readxl)
library(tidyverse)
library(caret)
library(ggplot2)
library(dplyr)
library(car)
library(DMwR2)
library(ROSE)
library(corrplot)
library(reshape2)
library(vcd)
library(pROC)
library(knitr)
library(randomForest)
library(xgboost)
library(GGally)
library(ggpubr)
library(nortest)
library(stringr)
library(writexl)
library(moments)
library(Metrics)
file_path <- "C:/Users/yuan1/Downloads/transfer_pricing_daily_data_v2.xlsx"
price<- readxl::read_excel(file_path)
# make the names readable for R
colnames(price) <- make.names(colnames(price))
print(colnames(price))
## [1] "Company.Name"
## [2] "Business.Unit"
## [3] "Report.Period"
## [4] "Currency"
## [5] "Region"
## [6] "Transfer.Pricing.Method"
## [7] "Benchmarking"
## [8] "Transaction.Volume..Gallons."
## [9] "Unit.Price....Gallon."
## [10] "Market.Reference.Price....Gallon."
## [11] "Trade.Terms"
## [12] "Crude.Oil.Purchase.Price....Barrel."
## [13] "Refining.Cost....Gallon."
## [14] "Transport.Cost....Gallon."
## [15] "Storage.Cost....Gallon."
## [16] "Pipeline.Tariffs....Gallon."
## [17] "Terminal.Fees....Gallon."
## [18] "Operating.Expenses....Gallon."
## [19] "Depreciation.Costs....Gallon."
## [20] "Environmental.Compliance.Costs....Gallon."
## [21] "Transfer.Pricing.Formula"
## [22] "Market.Benchmark....Gallon."
## [23] "Profit.Margin...."
## [24] "Risk.Adjustments....Gallon."
## [25] "Net.Profit.Margin...."
## [26] "Return.on.Assets...."
## [27] "Competitor.Price....Gallon."
## [28] "Regional.Supply.Demand.Trends"
## [29] "Alternative.Energy.Impact"
## [30] "Excise.Tax....Gallon."
## [31] "Value.Added.Tax...."
## [32] "Carbon.Emission.Tax....Gallon."
## [33] "Market.Price.Volatility...."
## [34] "Exchange.Rate.Risk...."
## [35] "Geopolitical.Risk"
## [36] "Refinery.Utilization.Rate...."
## [37] "Processing.Capacity..Barrels.Day."
## [38] "Yield.per.Barrel..Gallons."
## [39] "Total.Transaction.Value...."
## [40] "Refinery.Gross.Margin....Gallon."
## [41] "Break.even.Price....Gallon."
## [42] "Product.Name"
# count the varibale has only one level
unique_counts <- sapply(price, function(x) n_distinct(x, na.rm = TRUE))
one_level_vars <- names(unique_counts[unique_counts == 1])
print(one_level_vars)
## [1] "Company.Name" "Business.Unit"
## [3] "Currency" "Region"
## [5] "Transfer.Pricing.Method" "Benchmarking"
## [7] "Transfer.Pricing.Formula" "Product.Name"
price <- price %>% select(-one_level_vars)
print(price)
## # A tibble: 366 × 34
## Report.Period Transaction.Volume..Gallons. Unit.Price....Gallon.
## <chr> <dbl> <dbl>
## 1 2024-01-01 438352 3.77
## 2 2024-01-02 184378 3.97
## 3 2024-01-03 141337 3.19
## 4 2024-01-04 43593 3.88
## 5 2024-01-05 225908 3.83
## 6 2024-01-06 167237 3.49
## 7 2024-01-07 330267 2.68
## 8 2024-01-08 238738 3.49
## 9 2024-01-09 138323 3.65
## 10 2024-01-10 305710 3.64
## # ℹ 356 more rows
## # ℹ 31 more variables: Market.Reference.Price....Gallon. <dbl>,
## # Trade.Terms <chr>, Crude.Oil.Purchase.Price....Barrel. <dbl>,
## # Refining.Cost....Gallon. <dbl>, Transport.Cost....Gallon. <dbl>,
## # Storage.Cost....Gallon. <dbl>, Pipeline.Tariffs....Gallon. <dbl>,
## # Terminal.Fees....Gallon. <dbl>, Operating.Expenses....Gallon. <dbl>,
## # Depreciation.Costs....Gallon. <dbl>, …
# look for missing data
sum(is.na(price))
## [1] 0
#look for duplicate data
sum(duplicated(price))
## [1] 0
str(price)
## tibble [366 × 34] (S3: tbl_df/tbl/data.frame)
## $ Report.Period : chr [1:366] "2024-01-01" "2024-01-02" "2024-01-03" "2024-01-04" ...
## $ Transaction.Volume..Gallons. : num [1:366] 438352 184378 141337 43593 225908 ...
## $ Unit.Price....Gallon. : num [1:366] 3.77 3.97 3.19 3.88 3.83 3.49 2.68 3.49 3.65 3.64 ...
## $ Market.Reference.Price....Gallon. : num [1:366] 2.98 3.73 2.51 3.86 3.67 2.69 2.84 3.27 2.92 3.97 ...
## $ Trade.Terms : chr [1:366] "CIF" "FOB" "CIF" "CIF" ...
## $ Crude.Oil.Purchase.Price....Barrel. : num [1:366] 78.8 67 88.1 70.9 64.7 ...
## $ Refining.Cost....Gallon. : num [1:366] 0.39 0.26 0.25 0.2 0.43 0.45 0.38 0.41 0.41 0.35 ...
## $ Transport.Cost....Gallon. : num [1:366] 0.06 0.13 0.11 0.1 0.11 0.15 0.13 0.1 0.12 0.14 ...
## $ Storage.Cost....Gallon. : num [1:366] 0.05 0.07 0.04 0.03 0.05 0.05 0.05 0.07 0.07 0.07 ...
## $ Pipeline.Tariffs....Gallon. : num [1:366] 0.05 0.01 0.01 0.04 0.03 0.04 0.02 0.04 0.01 0.03 ...
## $ Terminal.Fees....Gallon. : num [1:366] 0.06 0.06 0.04 0.04 0.02 0.06 0.03 0.03 0.05 0.03 ...
## $ Operating.Expenses....Gallon. : num [1:366] 0.24 0.19 0.26 0.23 0.25 0.24 0.17 0.12 0.26 0.27 ...
## $ Depreciation.Costs....Gallon. : num [1:366] 0.05 0.06 0.15 0.12 0.11 0.18 0.16 0.1 0.18 0.06 ...
## $ Environmental.Compliance.Costs....Gallon.: num [1:366] 0.03 0.03 0.02 0.04 0.05 0.02 0.05 0.03 0.02 0.01 ...
## $ Market.Benchmark....Gallon. : num [1:366] 3.27 3.85 3.11 2.79 2.87 3.98 2.85 3.14 3.71 2.7 ...
## $ Profit.Margin.... : num [1:366] 11.94 12.13 6.39 13.06 13.81 ...
## $ Risk.Adjustments....Gallon. : num [1:366] 0.09 0.08 0.03 0.09 0.03 0.04 0.02 0.06 0.02 0.02 ...
## $ Net.Profit.Margin.... : num [1:366] 4.93 7.72 5.93 8.01 6.67 8.22 3.75 8.8 7.51 8.52 ...
## $ Return.on.Assets.... : num [1:366] 5.08 5.53 3.4 4.2 2.82 4.37 7.16 2.14 7.76 4.35 ...
## $ Competitor.Price....Gallon. : num [1:366] 3.18 3.27 4 3.47 3.25 3.05 3.58 3.21 2.97 3.85 ...
## $ Regional.Supply.Demand.Trends : chr [1:366] "Stable" "Stable" "Increasing Demand" "Stable" ...
## $ Alternative.Energy.Impact : chr [1:366] "High" "Moderate" "Moderate" "Moderate" ...
## $ Excise.Tax....Gallon. : num [1:366] 0.1 0.08 0.06 0.08 0.07 0.14 0.16 0.16 0.06 0.19 ...
## $ Value.Added.Tax.... : num [1:366] 17.6 11.42 9.4 10.62 7.86 ...
## $ Carbon.Emission.Tax....Gallon. : num [1:366] 0.01 0.03 0.09 0.05 0.08 0.05 0.01 0.03 0.1 0.04 ...
## $ Market.Price.Volatility.... : num [1:366] 2.9 4.12 4.09 7.86 2.74 7.2 5.49 1.61 7.4 2.01 ...
## $ Exchange.Rate.Risk.... : num [1:366] 3.4 1.46 2.09 2.72 4.2 1.21 1.82 4.99 2.6 2.65 ...
## $ Geopolitical.Risk : chr [1:366] "High" "High" "Low" "Medium" ...
## $ Refinery.Utilization.Rate.... : num [1:366] 80 91.3 91.3 86.3 90.8 ...
## $ Processing.Capacity..Barrels.Day. : num [1:366] 138129 355755 397510 174968 317043 ...
## $ Yield.per.Barrel..Gallons. : num [1:366] 33 31 35 34 34 34 30 32 36 32 ...
## $ Total.Transaction.Value.... : num [1:366] 1652587 731981 450865 169141 865228 ...
## $ Refinery.Gross.Margin....Gallon. : num [1:366] 2.59 3.47 2.26 3.66 3.24 2.24 2.46 2.86 2.51 3.62 ...
## $ Break.even.Price....Gallon. : num [1:366] 0.45 0.39 0.36 0.3 0.54 0.6 0.51 0.51 0.53 0.49 ...
#chagne chr to factor for modeleling
# Convert selected character columns to factor
price <- price %>%
mutate(
Trade.Terms = as.factor(Trade.Terms),
Regional.Supply.Demand.Trends = as.factor(Regional.Supply.Demand.Trends),
Alternative.Energy.Impact = as.factor(Alternative.Energy.Impact),
Geopolitical.Risk = as.factor(Geopolitical.Risk)
)
# Convert to date format
price$Report.Period <- as.Date(price$Report.Period, format="%Y-%m-%d")
str(price$Report.Period)
## Date[1:366], format: "2024-01-01" "2024-01-02" "2024-01-03" "2024-01-04" "2024-01-05" ...
str(price)
## tibble [366 × 34] (S3: tbl_df/tbl/data.frame)
## $ Report.Period : Date[1:366], format: "2024-01-01" "2024-01-02" ...
## $ Transaction.Volume..Gallons. : num [1:366] 438352 184378 141337 43593 225908 ...
## $ Unit.Price....Gallon. : num [1:366] 3.77 3.97 3.19 3.88 3.83 3.49 2.68 3.49 3.65 3.64 ...
## $ Market.Reference.Price....Gallon. : num [1:366] 2.98 3.73 2.51 3.86 3.67 2.69 2.84 3.27 2.92 3.97 ...
## $ Trade.Terms : Factor w/ 3 levels "CIF","DAP","FOB": 1 3 1 1 3 2 1 3 3 2 ...
## $ Crude.Oil.Purchase.Price....Barrel. : num [1:366] 78.8 67 88.1 70.9 64.7 ...
## $ Refining.Cost....Gallon. : num [1:366] 0.39 0.26 0.25 0.2 0.43 0.45 0.38 0.41 0.41 0.35 ...
## $ Transport.Cost....Gallon. : num [1:366] 0.06 0.13 0.11 0.1 0.11 0.15 0.13 0.1 0.12 0.14 ...
## $ Storage.Cost....Gallon. : num [1:366] 0.05 0.07 0.04 0.03 0.05 0.05 0.05 0.07 0.07 0.07 ...
## $ Pipeline.Tariffs....Gallon. : num [1:366] 0.05 0.01 0.01 0.04 0.03 0.04 0.02 0.04 0.01 0.03 ...
## $ Terminal.Fees....Gallon. : num [1:366] 0.06 0.06 0.04 0.04 0.02 0.06 0.03 0.03 0.05 0.03 ...
## $ Operating.Expenses....Gallon. : num [1:366] 0.24 0.19 0.26 0.23 0.25 0.24 0.17 0.12 0.26 0.27 ...
## $ Depreciation.Costs....Gallon. : num [1:366] 0.05 0.06 0.15 0.12 0.11 0.18 0.16 0.1 0.18 0.06 ...
## $ Environmental.Compliance.Costs....Gallon.: num [1:366] 0.03 0.03 0.02 0.04 0.05 0.02 0.05 0.03 0.02 0.01 ...
## $ Market.Benchmark....Gallon. : num [1:366] 3.27 3.85 3.11 2.79 2.87 3.98 2.85 3.14 3.71 2.7 ...
## $ Profit.Margin.... : num [1:366] 11.94 12.13 6.39 13.06 13.81 ...
## $ Risk.Adjustments....Gallon. : num [1:366] 0.09 0.08 0.03 0.09 0.03 0.04 0.02 0.06 0.02 0.02 ...
## $ Net.Profit.Margin.... : num [1:366] 4.93 7.72 5.93 8.01 6.67 8.22 3.75 8.8 7.51 8.52 ...
## $ Return.on.Assets.... : num [1:366] 5.08 5.53 3.4 4.2 2.82 4.37 7.16 2.14 7.76 4.35 ...
## $ Competitor.Price....Gallon. : num [1:366] 3.18 3.27 4 3.47 3.25 3.05 3.58 3.21 2.97 3.85 ...
## $ Regional.Supply.Demand.Trends : Factor w/ 3 levels "Decreasing Demand",..: 3 3 2 3 3 3 2 2 2 2 ...
## $ Alternative.Energy.Impact : Factor w/ 3 levels "High","Low","Moderate": 1 3 3 3 3 2 3 1 2 1 ...
## $ Excise.Tax....Gallon. : num [1:366] 0.1 0.08 0.06 0.08 0.07 0.14 0.16 0.16 0.06 0.19 ...
## $ Value.Added.Tax.... : num [1:366] 17.6 11.42 9.4 10.62 7.86 ...
## $ Carbon.Emission.Tax....Gallon. : num [1:366] 0.01 0.03 0.09 0.05 0.08 0.05 0.01 0.03 0.1 0.04 ...
## $ Market.Price.Volatility.... : num [1:366] 2.9 4.12 4.09 7.86 2.74 7.2 5.49 1.61 7.4 2.01 ...
## $ Exchange.Rate.Risk.... : num [1:366] 3.4 1.46 2.09 2.72 4.2 1.21 1.82 4.99 2.6 2.65 ...
## $ Geopolitical.Risk : Factor w/ 3 levels "High","Low","Medium": 1 1 2 3 2 2 1 1 1 2 ...
## $ Refinery.Utilization.Rate.... : num [1:366] 80 91.3 91.3 86.3 90.8 ...
## $ Processing.Capacity..Barrels.Day. : num [1:366] 138129 355755 397510 174968 317043 ...
## $ Yield.per.Barrel..Gallons. : num [1:366] 33 31 35 34 34 34 30 32 36 32 ...
## $ Total.Transaction.Value.... : num [1:366] 1652587 731981 450865 169141 865228 ...
## $ Refinery.Gross.Margin....Gallon. : num [1:366] 2.59 3.47 2.26 3.66 3.24 2.24 2.46 2.86 2.51 3.62 ...
## $ Break.even.Price....Gallon. : num [1:366] 0.45 0.39 0.36 0.3 0.54 0.6 0.51 0.51 0.53 0.49 ...
#data summary
summary(price)
## Report.Period Transaction.Volume..Gallons. Unit.Price....Gallon.
## Min. :2024-01-01 Min. : 11043 Min. :2.500
## 1st Qu.:2024-04-01 1st Qu.:132563 1st Qu.:2.890
## Median :2024-07-01 Median :238005 Median :3.285
## Mean :2024-07-01 Mean :246641 Mean :3.257
## 3rd Qu.:2024-09-30 3rd Qu.:366264 3rd Qu.:3.600
## Max. :2024-12-31 Max. :497593 Max. :4.000
## Market.Reference.Price....Gallon. Trade.Terms
## Min. :2.510 CIF:131
## 1st Qu.:2.973 DAP:103
## Median :3.395 FOB:132
## Mean :3.369
## 3rd Qu.:3.780
## Max. :4.200
## Crude.Oil.Purchase.Price....Barrel. Refining.Cost....Gallon.
## Min. :60.12 Min. :0.2000
## 1st Qu.:66.25 1st Qu.:0.2700
## Median :74.02 Median :0.3400
## Mean :74.39 Mean :0.3434
## 3rd Qu.:82.21 3rd Qu.:0.4200
## Max. :89.88 Max. :0.5000
## Transport.Cost....Gallon. Storage.Cost....Gallon. Pipeline.Tariffs....Gallon.
## Min. :0.0500 Min. :0.02000 Min. :0.01000
## 1st Qu.:0.0800 1st Qu.:0.04000 1st Qu.:0.02000
## Median :0.1000 Median :0.05000 Median :0.03000
## Mean :0.1014 Mean :0.04959 Mean :0.03011
## 3rd Qu.:0.1300 3rd Qu.:0.06000 3rd Qu.:0.04000
## Max. :0.1500 Max. :0.08000 Max. :0.05000
## Terminal.Fees....Gallon. Operating.Expenses....Gallon.
## Min. :0.02000 Min. :0.1000
## 1st Qu.:0.03000 1st Qu.:0.1500
## Median :0.04000 Median :0.2100
## Mean :0.03975 Mean :0.2015
## 3rd Qu.:0.05000 3rd Qu.:0.2500
## Max. :0.06000 Max. :0.3000
## Depreciation.Costs....Gallon. Environmental.Compliance.Costs....Gallon.
## Min. :0.0500 Min. :0.01000
## 1st Qu.:0.0800 1st Qu.:0.02000
## Median :0.1200 Median :0.03000
## Mean :0.1240 Mean :0.03005
## 3rd Qu.:0.1675 3rd Qu.:0.04000
## Max. :0.2000 Max. :0.05000
## Market.Benchmark....Gallon. Profit.Margin.... Risk.Adjustments....Gallon.
## Min. :2.600 Min. : 5.040 Min. :0.01000
## 1st Qu.:2.940 1st Qu.: 7.293 1st Qu.:0.03000
## Median :3.285 Median : 9.575 Median :0.05000
## Mean :3.284 Mean : 9.783 Mean :0.05413
## 3rd Qu.:3.660 3rd Qu.:11.985 3rd Qu.:0.08000
## Max. :4.000 Max. :14.950 Max. :0.10000
## Net.Profit.Margin.... Return.on.Assets.... Competitor.Price....Gallon.
## Min. :3.020 Min. :2.010 Min. :2.510
## 1st Qu.:4.825 1st Qu.:3.442 1st Qu.:2.960
## Median :6.590 Median :4.940 Median :3.340
## Mean :6.489 Mean :4.949 Mean :3.333
## 3rd Qu.:8.137 3rd Qu.:6.410 3rd Qu.:3.720
## Max. :9.960 Max. :7.990 Max. :4.090
## Regional.Supply.Demand.Trends Alternative.Energy.Impact
## Decreasing Demand:118 High :117
## Increasing Demand:124 Low :123
## Stable :124 Moderate:126
##
##
##
## Excise.Tax....Gallon. Value.Added.Tax.... Carbon.Emission.Tax....Gallon.
## Min. :0.050 Min. : 5.070 Min. :0.01000
## 1st Qu.:0.090 1st Qu.: 9.367 1st Qu.:0.03000
## Median :0.120 Median :13.160 Median :0.05000
## Mean :0.123 Mean :12.759 Mean :0.05434
## 3rd Qu.:0.160 3rd Qu.:16.137 3rd Qu.:0.08000
## Max. :0.200 Max. :19.900 Max. :0.10000
## Market.Price.Volatility.... Exchange.Rate.Risk.... Geopolitical.Risk
## Min. :1.040 Min. :0.510 High :123
## 1st Qu.:3.103 1st Qu.:1.512 Low :113
## Median :5.110 Median :2.660 Medium:130
## Mean :5.416 Mean :2.648
## 3rd Qu.:7.577 3rd Qu.:3.690
## Max. :9.980 Max. :5.000
## Refinery.Utilization.Rate.... Processing.Capacity..Barrels.Day.
## Min. :75.03 Min. :100399
## 1st Qu.:80.32 1st Qu.:197118
## Median :85.64 Median :297231
## Mean :85.37 Mean :295243
## 3rd Qu.:90.46 3rd Qu.:392937
## Max. :94.98 Max. :499672
## Yield.per.Barrel..Gallons. Total.Transaction.Value....
## Min. :30.0 Min. : 31141
## 1st Qu.:33.0 1st Qu.: 421065
## Median :36.0 Median : 764578
## Mean :35.5 Mean : 805110
## 3rd Qu.:39.0 3rd Qu.:1202052
## Max. :41.0 Max. :1946554
## Refinery.Gross.Margin....Gallon. Break.even.Price....Gallon.
## Min. :2.080 Min. :0.2500
## 1st Qu.:2.603 1st Qu.:0.3700
## Median :3.065 Median :0.4400
## Mean :3.025 Mean :0.4449
## 3rd Qu.:3.428 3rd Qu.:0.5200
## Max. :3.970 Max. :0.6500
numeric <- price[, sapply(price, is.numeric), drop = FALSE]
factor <- price[, sapply(price, is.factor), drop = FALSE]
numeric <- numeric %>%
rename_with(~ str_wrap(.x, width = 20))
numeric %>%
gather(key = "Variable", value = "Value") %>%
ggplot(aes(x = Value)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "lightblue", color = "black") +
geom_density(color = "blue", linewidth = 0.5) +
facet_wrap(~Variable, scales = "free") +
theme_minimal() +
theme(strip.text = element_text(size = 6)) +
ggtitle("Histogram and Density Plot of Numeric Variables")
# format the plot
numeric <- numeric %>%
rename_with(~ stringr::str_wrap(.x, width = 20))
numeric <- numeric %>%
gather(key = "Variable", value = "Value")
# Q-Q Plot
ggplot(numeric, aes(sample = Value)) +
stat_qq() +
stat_qq_line(color = "red") +
facet_wrap(~Variable, scales = "free") +
theme_minimal() +
theme(strip.text = element_text(size = 6)) +
ggtitle("Q-Q Plot of Numeric Variables")
# Select all numeric variables
num_vars <- price %>% dplyr::select(where(is.numeric))
# Perform Shapiro-Wilk test for all numeric variables
shapiro_results <- sapply(num_vars, function(x) shapiro.test(x)$p.value)
# Filter out non-normally distributed variables (p < 0.05 means significantly deviating from normality)
non_normal_vars <- names(shapiro_results[shapiro_results < 0.05])
# Display variables that are not normally distributed
print(non_normal_vars)
## [1] "Transaction.Volume..Gallons."
## [2] "Unit.Price....Gallon."
## [3] "Market.Reference.Price....Gallon."
## [4] "Crude.Oil.Purchase.Price....Barrel."
## [5] "Refining.Cost....Gallon."
## [6] "Transport.Cost....Gallon."
## [7] "Storage.Cost....Gallon."
## [8] "Pipeline.Tariffs....Gallon."
## [9] "Terminal.Fees....Gallon."
## [10] "Operating.Expenses....Gallon."
## [11] "Depreciation.Costs....Gallon."
## [12] "Environmental.Compliance.Costs....Gallon."
## [13] "Market.Benchmark....Gallon."
## [14] "Profit.Margin...."
## [15] "Risk.Adjustments....Gallon."
## [16] "Net.Profit.Margin...."
## [17] "Return.on.Assets...."
## [18] "Competitor.Price....Gallon."
## [19] "Excise.Tax....Gallon."
## [20] "Value.Added.Tax...."
## [21] "Carbon.Emission.Tax....Gallon."
## [22] "Market.Price.Volatility...."
## [23] "Exchange.Rate.Risk...."
## [24] "Refinery.Utilization.Rate...."
## [25] "Processing.Capacity..Barrels.Day."
## [26] "Yield.per.Barrel..Gallons."
## [27] "Total.Transaction.Value...."
## [28] "Refinery.Gross.Margin....Gallon."
## [29] "Break.even.Price....Gallon."
num_vars <- price %>% dplyr::select(where(is.numeric))
# Shapiro-Wilk W > 0.97 normal distribution. W < 0.95 need log transferring
shapiro_w_values <- sapply(num_vars, function(x) shapiro.test(x)$statistic)
print(shapiro_w_values)
## Transaction.Volume..Gallons..W
## 0.9564545
## Unit.Price....Gallon..W
## 0.9608380
## Market.Reference.Price....Gallon..W
## 0.9572767
## Crude.Oil.Purchase.Price....Barrel..W
## 0.9446090
## Refining.Cost....Gallon..W
## 0.9517145
## Transport.Cost....Gallon..W
## 0.9456603
## Storage.Cost....Gallon..W
## 0.9401415
## Pipeline.Tariffs....Gallon..W
## 0.9124105
## Terminal.Fees....Gallon..W
## 0.9055461
## Operating.Expenses....Gallon..W
## 0.9527556
## Depreciation.Costs....Gallon..W
## 0.9402306
## Environmental.Compliance.Costs....Gallon..W
## 0.9126770
## Market.Benchmark....Gallon..W
## 0.9466619
## Profit.Margin.....W
## 0.9556320
## Risk.Adjustments....Gallon..W
## 0.9290836
## Net.Profit.Margin.....W
## 0.9610444
## Return.on.Assets.....W
## 0.9550355
## Competitor.Price....Gallon..W
## 0.9610444
## Excise.Tax....Gallon..W
## 0.9534736
## Value.Added.Tax.....W
## 0.9614095
## Carbon.Emission.Tax....Gallon..W
## 0.9489732
## Market.Price.Volatility.....W
## 0.9507635
## Exchange.Rate.Risk.....W
## 0.9543719
## Refinery.Utilization.Rate.....W
## 0.9527396
## Processing.Capacity..Barrels.Day..W
## 0.9553617
## Yield.per.Barrel..Gallons..W
## 0.9371891
## Total.Transaction.Value.....W
## 0.9627231
## Refinery.Gross.Margin....Gallon..W
## 0.9649656
## Break.even.Price....Gallon..W
## 0.9777192
num_vars <- price %>% select(where(is.numeric))
skewness_values <- sapply(num_vars, skewness)
print(skewness_values)
## Transaction.Volume..Gallons.
## 0.121901357
## Unit.Price....Gallon.
## -0.027283921
## Market.Reference.Price....Gallon.
## -0.117339326
## Crude.Oil.Purchase.Price....Barrel.
## 0.113995899
## Refining.Cost....Gallon.
## 0.120508662
## Transport.Cost....Gallon.
## -0.067292853
## Storage.Cost....Gallon.
## 0.132075735
## Pipeline.Tariffs....Gallon.
## 0.014786441
## Terminal.Fees....Gallon.
## 0.085572735
## Operating.Expenses....Gallon.
## -0.047390165
## Depreciation.Costs....Gallon.
## 0.054832943
## Environmental.Compliance.Costs....Gallon.
## -0.029533925
## Market.Benchmark....Gallon.
## 0.036954080
## Profit.Margin....
## 0.143909381
## Risk.Adjustments....Gallon.
## 0.040637845
## Net.Profit.Margin....
## -0.005319028
## Return.on.Assets....
## 0.049559175
## Competitor.Price....Gallon.
## -0.058777167
## Excise.Tax....Gallon.
## 0.126985910
## Value.Added.Tax....
## -0.141642001
## Carbon.Emission.Tax....Gallon.
## 0.054645602
## Market.Price.Volatility....
## 0.104497522
## Exchange.Rate.Risk....
## 0.110644036
## Refinery.Utilization.Rate....
## -0.061608004
## Processing.Capacity..Barrels.Day.
## -0.042291028
## Yield.per.Barrel..Gallons.
## -0.058149776
## Total.Transaction.Value....
## 0.310931782
## Refinery.Gross.Margin....Gallon.
## -0.117458363
## Break.even.Price....Gallon.
## 0.081942787
# look for num outliers
numeric_vars <- price %>% dplyr::select(where(is.numeric))
outliers <- lapply(numeric_vars, function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
which(x < lower_bound | x > upper_bound)
})
outliers
## $Transaction.Volume..Gallons.
## integer(0)
##
## $Unit.Price....Gallon.
## integer(0)
##
## $Market.Reference.Price....Gallon.
## integer(0)
##
## $Crude.Oil.Purchase.Price....Barrel.
## integer(0)
##
## $Refining.Cost....Gallon.
## integer(0)
##
## $Transport.Cost....Gallon.
## integer(0)
##
## $Storage.Cost....Gallon.
## integer(0)
##
## $Pipeline.Tariffs....Gallon.
## integer(0)
##
## $Terminal.Fees....Gallon.
## integer(0)
##
## $Operating.Expenses....Gallon.
## integer(0)
##
## $Depreciation.Costs....Gallon.
## integer(0)
##
## $Environmental.Compliance.Costs....Gallon.
## integer(0)
##
## $Market.Benchmark....Gallon.
## integer(0)
##
## $Profit.Margin....
## integer(0)
##
## $Risk.Adjustments....Gallon.
## integer(0)
##
## $Net.Profit.Margin....
## integer(0)
##
## $Return.on.Assets....
## integer(0)
##
## $Competitor.Price....Gallon.
## integer(0)
##
## $Excise.Tax....Gallon.
## integer(0)
##
## $Value.Added.Tax....
## integer(0)
##
## $Carbon.Emission.Tax....Gallon.
## integer(0)
##
## $Market.Price.Volatility....
## integer(0)
##
## $Exchange.Rate.Risk....
## integer(0)
##
## $Refinery.Utilization.Rate....
## integer(0)
##
## $Processing.Capacity..Barrels.Day.
## integer(0)
##
## $Yield.per.Barrel..Gallons.
## integer(0)
##
## $Total.Transaction.Value....
## integer(0)
##
## $Refinery.Gross.Margin....Gallon.
## integer(0)
##
## $Break.even.Price....Gallon.
## integer(0)
price <- price %>% mutate(across(where(is.numeric), scale))
cor_values <- sapply(price %>% select(where(is.numeric)), function(x) cor(x, price[["Unit.Price....Gallon."]], use = "complete.obs"))
# |cor| < 0.1
low_corr_vars <- names(cor_values[abs(cor_values) < 0.1])
print("Variables with Low Absolute Correlation to Target:")
## [1] "Variables with Low Absolute Correlation to Target:"
print(low_corr_vars)
## [1] "Transaction.Volume..Gallons."
## [2] "Crude.Oil.Purchase.Price....Barrel."
## [3] "Refining.Cost....Gallon."
## [4] "Transport.Cost....Gallon."
## [5] "Storage.Cost....Gallon."
## [6] "Pipeline.Tariffs....Gallon."
## [7] "Terminal.Fees....Gallon."
## [8] "Operating.Expenses....Gallon."
## [9] "Depreciation.Costs....Gallon."
## [10] "Environmental.Compliance.Costs....Gallon."
## [11] "Market.Benchmark....Gallon."
## [12] "Profit.Margin...."
## [13] "Risk.Adjustments....Gallon."
## [14] "Net.Profit.Margin...."
## [15] "Return.on.Assets...."
## [16] "Competitor.Price....Gallon."
## [17] "Excise.Tax....Gallon."
## [18] "Value.Added.Tax...."
## [19] "Carbon.Emission.Tax....Gallon."
## [20] "Market.Price.Volatility...."
## [21] "Refinery.Utilization.Rate...."
## [22] "Processing.Capacity..Barrels.Day."
## [23] "Break.even.Price....Gallon."
numeric <- price[, sapply(price, is.numeric), drop = FALSE]
# look for the for the correlation between num and y
cor_results <- cor(numeric, use = "complete.obs")
cor_results["Unit.Price....Gallon.", ]
## Transaction.Volume..Gallons.
## 0.031694613
## Unit.Price....Gallon.
## 1.000000000
## Market.Reference.Price....Gallon.
## 0.127159227
## Crude.Oil.Purchase.Price....Barrel.
## 0.019416446
## Refining.Cost....Gallon.
## -0.059677350
## Transport.Cost....Gallon.
## 0.032534250
## Storage.Cost....Gallon.
## 0.017488733
## Pipeline.Tariffs....Gallon.
## -0.002559830
## Terminal.Fees....Gallon.
## 0.019706524
## Operating.Expenses....Gallon.
## 0.047817560
## Depreciation.Costs....Gallon.
## -0.050515793
## Environmental.Compliance.Costs....Gallon.
## 0.055733286
## Market.Benchmark....Gallon.
## -0.046882739
## Profit.Margin....
## -0.076025599
## Risk.Adjustments....Gallon.
## -0.054379813
## Net.Profit.Margin....
## -0.078125814
## Return.on.Assets....
## 0.008796997
## Competitor.Price....Gallon.
## 0.083188138
## Excise.Tax....Gallon.
## -0.037687003
## Value.Added.Tax....
## 0.045761039
## Carbon.Emission.Tax....Gallon.
## -0.004259401
## Market.Price.Volatility....
## -0.020275851
## Exchange.Rate.Risk....
## 0.103598985
## Refinery.Utilization.Rate....
## 0.048157393
## Processing.Capacity..Barrels.Day.
## -0.018505055
## Yield.per.Barrel..Gallons.
## 0.107334769
## Total.Transaction.Value....
## 0.253527157
## Refinery.Gross.Margin....Gallon.
## 0.136125104
## Break.even.Price....Gallon.
## -0.045477075
#cor > 0.5 →positive relation, price go up when ind bigger
#cor < -0.5 → neg relation, price go down when ind bigger
#cor ≈ 0 → no correlation
dim(price)
## [1] 366 34
price <- price %>% select(-all_of(low_corr_vars))
str(price)
## tibble [366 × 11] (S3: tbl_df/tbl/data.frame)
## $ Report.Period : Date[1:366], format: "2024-01-01" "2024-01-02" ...
## $ Unit.Price....Gallon. : num [1:366] 3.77 3.97 3.19 3.88 3.83 3.49 2.68 3.49 3.65 3.64 ...
## $ Market.Reference.Price....Gallon.: num [1:366] 2.98 3.73 2.51 3.86 3.67 2.69 2.84 3.27 2.92 3.97 ...
## $ Trade.Terms : Factor w/ 3 levels "CIF","DAP","FOB": 1 3 1 1 3 2 1 3 3 2 ...
## $ Regional.Supply.Demand.Trends : Factor w/ 3 levels "Decreasing Demand",..: 3 3 2 3 3 3 2 2 2 2 ...
## $ Alternative.Energy.Impact : Factor w/ 3 levels "High","Low","Moderate": 1 3 3 3 3 2 3 1 2 1 ...
## $ Exchange.Rate.Risk.... : num [1:366] 3.4 1.46 2.09 2.72 4.2 1.21 1.82 4.99 2.6 2.65 ...
## $ Geopolitical.Risk : Factor w/ 3 levels "High","Low","Medium": 1 1 2 3 2 2 1 1 1 2 ...
## $ Yield.per.Barrel..Gallons. : num [1:366] 33 31 35 34 34 34 30 32 36 32 ...
## $ Total.Transaction.Value.... : num [1:366] 1652587 731981 450865 169141 865228 ...
## $ Refinery.Gross.Margin....Gallon. : num [1:366] 2.59 3.47 2.26 3.66 3.24 2.24 2.46 2.86 2.51 3.62 ...
dim(price)
## [1] 366 11
#correlation look for correlation between num
numeric <- price[, sapply(price, is.numeric), drop = FALSE]
cor_matrix <- cor(numeric_vars, use = "complete.obs")
# (|cor| > 0.7)
high_cor_pairs <- which(abs(cor_matrix) > 0.7 & abs(cor_matrix) < 1, arr.ind = TRUE)
high_cor_df <- data.frame(
Variable_1 = rownames(cor_matrix)[high_cor_pairs[, 1]],
Variable_2 = colnames(cor_matrix)[high_cor_pairs[, 2]],
Correlation = cor_matrix[high_cor_pairs]
)
high_cor_df <- high_cor_df %>%
distinct(Variable_1, Variable_2, .keep_all = TRUE)
print(high_cor_df)
## Variable_1 Variable_2
## 1 Total.Transaction.Value.... Transaction.Volume..Gallons.
## 2 Refinery.Gross.Margin....Gallon. Market.Reference.Price....Gallon.
## 3 Break.even.Price....Gallon. Refining.Cost....Gallon.
## 4 Transaction.Volume..Gallons. Total.Transaction.Value....
## 5 Market.Reference.Price....Gallon. Refinery.Gross.Margin....Gallon.
## 6 Refining.Cost....Gallon. Break.even.Price....Gallon.
## Correlation
## 1 0.9667413
## 2 0.9841007
## 3 0.9452115
## 4 0.9667413
## 5 0.9841007
## 6 0.9452115
#delete the high correlated
price <- price[, setdiff(names(price), c("Total.Transaction.Value....",
"Refinery.Gross.Margin....Gallon.",
"Break.even.Price....Gallon.")), drop = FALSE]
dim(price)
## [1] 366 9
str(price)
## tibble [366 × 9] (S3: tbl_df/tbl/data.frame)
## $ Report.Period : Date[1:366], format: "2024-01-01" "2024-01-02" ...
## $ Unit.Price....Gallon. : num [1:366] 3.77 3.97 3.19 3.88 3.83 3.49 2.68 3.49 3.65 3.64 ...
## $ Market.Reference.Price....Gallon.: num [1:366] 2.98 3.73 2.51 3.86 3.67 2.69 2.84 3.27 2.92 3.97 ...
## $ Trade.Terms : Factor w/ 3 levels "CIF","DAP","FOB": 1 3 1 1 3 2 1 3 3 2 ...
## $ Regional.Supply.Demand.Trends : Factor w/ 3 levels "Decreasing Demand",..: 3 3 2 3 3 3 2 2 2 2 ...
## $ Alternative.Energy.Impact : Factor w/ 3 levels "High","Low","Moderate": 1 3 3 3 3 2 3 1 2 1 ...
## $ Exchange.Rate.Risk.... : num [1:366] 3.4 1.46 2.09 2.72 4.2 1.21 1.82 4.99 2.6 2.65 ...
## $ Geopolitical.Risk : Factor w/ 3 levels "High","Low","Medium": 1 1 2 3 2 2 1 1 1 2 ...
## $ Yield.per.Barrel..Gallons. : num [1:366] 33 31 35 34 34 34 30 32 36 32 ...
#build linear regression model
set.seed(5678)
train_index <- sample(1:nrow(price), 0.8 * nrow(price))
train_data <- price[train_index, ]
test_data <- price[-train_index, ]
lm_model <- lm(`Unit.Price....Gallon.` ~ ., data = train_data)
train_preds <- predict(lm_model, newdata = train_data)
test_preds <- predict(lm_model, newdata = test_data)
# MSE
mse_train <- mean((train_data$`Unit.Price....Gallon.` - train_preds)^2)
mse_test <- mean((test_data$`Unit.Price....Gallon.` - test_preds)^2)
# RMSE
rmse_train <- sqrt(mse_train)
rmse_test <- sqrt(mse_test)
# R²
r_squared_train <- summary(lm_model)$r.squared
r_squared_test <- 1 - sum((test_preds - test_data$`Unit.Price....Gallon.`)^2) /
sum((test_data$`Unit.Price....Gallon.` - mean(test_data$`Unit.Price....Gallon.`))^2)
cat("📌 ****\n")
## 📌 ****
cat("✅ MSE:", mse_train, "\n")
## ✅ MSE: 0.1733605
cat("✅ MSE:", mse_test, "\n")
## ✅ MSE: 0.1601503
cat("✅ RMSE:", rmse_train, "\n")
## ✅ RMSE: 0.4163658
cat("✅ RMSE:", rmse_test, "\n")
## ✅ RMSE: 0.4001878
cat("✅ R²:", r_squared_train, "\n")
## ✅ R²: 0.04952773
cat("✅ R²:", r_squared_test, "\n")
## ✅ R²: 0.05635562
# predict the price
# 预测 test_data
test_preds <- predict(lm_model, newdata = test_data)
# 将预测结果添加到 test_data
test_data$Predicted_Unit_Price <- test_preds
# 计算 MSE 和 RMSE 评估预测效果
mse_test <- mean((test_data$`Unit.Price....Gallon.` - test_preds)^2)
rmse_test <- sqrt(mse_test)
r_squared_test <- 1 - sum((test_preds - test_data$`Unit.Price....Gallon.`)^2) /
sum((test_data$`Unit.Price....Gallon.` - mean(test_data$`Unit.Price....Gallon.`))^2)
# 打印预测结果
cat("📌 Test Data Prediction Results\n")
## 📌 Test Data Prediction Results
cat("✅ MSE:", mse_test, "\n")
## ✅ MSE: 0.1601503
cat("✅ RMSE:", rmse_test, "\n")
## ✅ RMSE: 0.4001878
cat("✅ R²:", r_squared_test, "\n")
## ✅ R²: 0.05635562
# 显示部分预测结果
head(test_data[, c("Report.Period", "Unit.Price....Gallon.", "Predicted_Unit_Price")])
## # A tibble: 6 × 3
## Report.Period Unit.Price....Gallon. Predicted_Unit_Price
## <date> <dbl> <dbl>
## 1 2024-01-07 2.68 3.15
## 2 2024-01-08 3.49 3.19
## 3 2024-01-16 3.65 3.35
## 4 2024-01-21 2.87 3.37
## 5 2024-01-24 3.51 3.38
## 6 2024-02-02 3.24 3.37