Instructions:

This project will showcase your ability to apply the concepts learned throughout the course.

The dataset you will use for this examination is provided as retail data.csv, which contains the following variables:

Product_ID: Unique identifier for each product.

Sales: Simulated sales numbers (in dollars).

Inventory_Levels: Inventory levels for each product.

Lead_Time_Days: The lead time in days for each product.

Price: The price of each product.

Seasonality_Index: An index representing seasonality.

# load dataset
retail_data <- read.csv("C:/Users/vitug/OneDrive/Desktop/CUNY Masters/DATA_605/synthetic_retail_data.csv")
# head
head(retail_data)

# summary
summary(retail_data)

##    Product_ID         Sales         Inventory_Levels Lead_Time_Days  
##  Min.   :  1.00   Min.   :  25.57   Min.   : 67.35   Min.   : 0.491  
##  1st Qu.: 50.75   1st Qu.: 284.42   1st Qu.:376.51   1st Qu.: 5.291  
##  Median :100.50   Median : 533.54   Median :483.72   Median : 6.765  
##  Mean   :100.50   Mean   : 636.92   Mean   :488.55   Mean   : 6.834  
##  3rd Qu.:150.25   3rd Qu.: 867.58   3rd Qu.:600.42   3rd Qu.: 8.212  
##  Max.   :200.00   Max.   :2447.49   Max.   :858.79   Max.   :12.722  
##      Price        Seasonality_Index
##  Min.   : 5.053   Min.   :0.3305   
##  1st Qu.:16.554   1st Qu.:0.8475   
##  Median :19.977   Median :0.9762   
##  Mean   :19.560   Mean   :0.9829   
##  3rd Qu.:22.924   3rd Qu.:1.1205   
##  Max.   :29.404   Max.   :1.5958

# data structure
str(retail_data)

## 'data.frame':    200 obs. of  6 variables:
##  $ Product_ID       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Sales            : num  158 279 699 1832 460 ...
##  $ Inventory_Levels : num  367 427 408 392 448 ...
##  $ Lead_Time_Days   : num  6.31 5.8 3.07 3.53 10.8 ...
##  $ Price            : num  18.8 26.1 22.4 27.1 18.3 ...
##  $ Seasonality_Index: num  1.184 0.857 0.699 0.698 0.841 ...

Problem 1: Business Risk and Revenue Modeling

Context: You are a data scientist working for a retail chain that models sales, inventory levels, and the impact of pricing and seasonality on revenue. Your task is to analyze various distributions that can describe sales variability and forecast potential revenue.

Part 1: Empirical and Theoretical Analysis of Distributions

Generate and Analyze Distributions:

X ~ Sales: Consider the Sales variable from the dataset. Assume it follows a Gamma distribution and estimate its shape and scale parameters using the fitdistr function from the MASS package.

# Fit Gamma distribution to Sales
sales_gamma_fit <- fitdistr(retail_data$Sales, "gamma")

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

sales_shape <- sales_gamma_fit$estimate["shape"]
sales_rate <- sales_gamma_fit$estimate["rate"]
sales_scale <- 1/sales_rate

# Create a histogram of Sales with the fitted Gamma distribution
hist(retail_data$Sales, freq = FALSE, breaks = 20, 
     main = "Histogram of Sales with Fitted Gamma Distribution",
     xlab = "Sales", col = "lightblue")
curve(dgamma(x, shape = sales_shape, rate = sales_rate), 
      add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("Actual Data", "Gamma Distribution"), 
       col = c("lightblue", "red"), lwd = c(10, 2))

Y ~ Inventory Levels: Assume that the sum of inventory levels across similar products follows a Lognormal distribution. Estimate the parameters for this distribution.

# Fit Lognormal distribution to Inventory_Levels
inventory_lognormal_fit <- fitdistr(retail_data$Inventory_Levels, "lognormal")
inventory_meanlog <- inventory_lognormal_fit$estimate["meanlog"]
inventory_sdlog <- inventory_lognormal_fit$estimate["sdlog"]

# Create a histogram of Inventory_Levels with the fitted Lognormal distribution
hist(retail_data$Inventory_Levels, freq = FALSE, breaks = 20, 
     main = "Histogram of Inventory Levels with Fitted Lognormal Distribution",
     xlab = "Inventory Levels", col = "lightgreen")
curve(dlnorm(x, meanlog = inventory_meanlog, sdlog = inventory_sdlog), 
      add = TRUE, col = "blue", lwd = 2)
legend("topright", legend = c("Actual Data", "Lognormal Distribution"), 
       col = c("lightgreen", "blue"), lwd = c(10, 2))

Z ~ Lead Time: Assume that Lead_Time_Days follows a Normal distribution. Estimate the mean and standard deviation.

# Fit Normal distribution to Lead_Time_Days
leadtime_normal_fit <- fitdistr(retail_data$Lead_Time_Days, "normal")
leadtime_mean <- leadtime_normal_fit$estimate["mean"]
leadtime_sd <- leadtime_normal_fit$estimate["sd"]

# Create a histogram of Lead_Time_Days with the fitted Normal distribution
hist(retail_data$Lead_Time_Days, freq = FALSE, breaks = 20, 
     main = "Histogram of Lead Time Days with Fitted Normal Distribution",
     xlab = "Lead Time Days", col = "lightyellow")
curve(dnorm(x, mean = leadtime_mean, sd = leadtime_sd), 
      add = TRUE, col = "darkgreen", lwd = 2)
legend("topright", legend = c("Actual Data", "Normal Distribution"), 
       col = c("lightyellow", "darkgreen"), lwd = c(10, 2))

Calculate Empirical Expected Value and Variance:

Calculate the empirical mean and variance for all three variables.

# Calculate empirical statistics for all three variables
empirical_stats <- data.frame(
  Variable = c("Sales", "Inventory_Levels", "Lead_Time_Days"),
  Empirical_Mean = c(
    mean(retail_data$Sales),
    mean(retail_data$Inventory_Levels),
    mean(retail_data$Lead_Time_Days)
  ),
  Empirical_Variance = c(
    var(retail_data$Sales),
    var(retail_data$Inventory_Levels),
    var(retail_data$Lead_Time_Days)
  )
)

Compare these empirical values with the theoretical values derived from the estimated distribution parameters.

# Calculate theoretical values from estimated distribution parameters
theoretical_mean_sales <- sales_shape / sales_rate
theoretical_var_sales <- sales_shape / (sales_rate^2)

# For Lognormal
theoretical_mean_inventory <- exp(inventory_meanlog + (inventory_sdlog^2)/2)
theoretical_var_inventory <- (exp(inventory_sdlog^2) - 1) * exp(2*inventory_meanlog + inventory_sdlog^2)

# For Normal
theoretical_mean_leadtime <- leadtime_mean
theoretical_var_leadtime <- leadtime_sd^2

theoretical_stats <- data.frame(
  Variable = c("Sales", "Inventory_Levels", "Lead_Time_Days"),
  Theoretical_Mean = c(
    theoretical_mean_sales,
    theoretical_mean_inventory,
    theoretical_mean_leadtime
  ),
  Theoretical_Variance = c(
    theoretical_var_sales,
    theoretical_var_inventory,
    theoretical_var_leadtime
  )
)

# Combine empirical and theoretical statistics
comparison_stats <- merge(empirical_stats, theoretical_stats, by = "Variable")

# Calculate percent differences
comparison_stats$Mean_Pct_Diff <- 100 * abs(comparison_stats$Empirical_Mean - comparison_stats$Theoretical_Mean) / comparison_stats$Empirical_Mean
comparison_stats$Var_Pct_Diff <- 100 * abs(comparison_stats$Empirical_Variance - comparison_stats$Theoretical_Variance) / comparison_stats$Empirical_Variance

# Create a comparison table
  kable(comparison_stats, digits = 3, caption = "Comparison of Empirical vs Theoretical Values")

Comparison of Empirical vs Theoretical Values
Variable	Empirical_Mean	Empirical_Variance	Theoretical_Mean	Theoretical_Variance	Mean_Pct_Diff	Var_Pct_Diff
Inventory_Levels	488.547	24039.446	492.276	34197.47	0.763	42.256
Lead_Time_Days	6.834	4.362	6.834	4.34	0.000	0.500
Sales	636.916	214831.751	636.915	221073.18	0.000	2.905

# Visual comparison of empirical vs theoretical means
barplot(rbind(comparison_stats$Empirical_Mean, comparison_stats$Theoretical_Mean), 
        beside = TRUE, 
        col = c("darkblue", "lightblue"),
        names.arg = comparison_stats$Variable,
        main = "Comparison of Empirical vs Theoretical Means",
        legend.text = c("Empirical Mean", "Theoretical Mean"))

# Visual comparison of empirical vs theoretical variances
barplot(rbind(comparison_stats$Empirical_Variance, comparison_stats$Theoretical_Variance), 
        beside = TRUE, 
        col = c("darkred", "pink"),
        names.arg = comparison_stats$Variable,
        main = "Comparison of Empirical vs Theoretical Variances",
        legend.text = c("Empirical Variance", "Theoretical Variance"))

Part 2: Probability Analysis and Independence Testing

Empirical Probabilities

For the Lead_Time_Days variable (assumed to be normally distributed), calculate the following empirical probabilities:

\[(Z > \mu \mid Z > \mu - \sigma)\] \[P(Z > \mu + \sigma \mid Z > \mu)\] \[P(Z > \mu + 2\sigma \mid Z > \mu)\]

# create lead_time function
lead_time <- retail_data$Lead_Time_Days

# Calculate the mean and standard deviation
mean_lt <- mean(lead_time)
sd_lt <- sd(lead_time)

# Calculate empirical probabilities p1
p1_numerator <- sum(lead_time > mean_lt)
p1_denominator <- sum(lead_time > (mean_lt - sd_lt))
p1 <- p1_numerator / p1_denominator

# Calculate empirical probabilities p2
p2_numerator <- sum(lead_time > (mean_lt + sd_lt))
p2_denominator <- sum(lead_time > mean_lt)
p2 <- p2_numerator / p2_denominator

# Calculate empirical probabilities p3
p3_numerator <- sum(lead_time > (mean_lt + 2*sd_lt))
p3_denominator <- sum(lead_time > mean_lt)
p3 <- p3_numerator / p3_denominator

# Display the results
cat("Empirical Conditional Probabilities for Lead Time:\n")

## Empirical Conditional Probabilities for Lead Time:

cat("P(Z > μ | Z > μ - σ) =", p1, "\n")

## P(Z > μ | Z > μ - σ) = 0.5928144

cat("P(Z > μ + σ | Z > μ) =", p2, "\n")

## P(Z > μ + σ | Z > μ) = 0.3131313

cat("P(Z > μ + 2σ | Z > μ) =", p3, "\n")

## P(Z > μ + 2σ | Z > μ) = 0.03030303

# Compare with theoretical conditional probabilities for a normal distribution
cat("\nTheoretical Conditional Probabilities for Normal Distribution:\n")

## 
## Theoretical Conditional Probabilities for Normal Distribution:

cat("P(Z > μ | Z > μ - σ) =", 0.5/0.8413, "\n")

## P(Z > μ | Z > μ - σ) = 0.5943183

cat("P(Z > μ + σ | Z > μ) =", 0.1587/0.5, "\n")

## P(Z > μ + σ | Z > μ) = 0.3174

cat("P(Z > μ + 2σ | Z > μ) =", 0.0228/0.5, "\n")

## P(Z > μ + 2σ | Z > μ) = 0.0456

Correlation and Independence:

Investigate the correlation between Sales and Price. Create a contingency table using quartiles of Sales and Price, and then evaluate the marginal and joint probabilities.

# Create quartiles for Sales and Price
sales_quartiles <- cut(retail_data$Sales, breaks = quantile(retail_data$Sales, probs = seq(0, 1, 0.25)), 
                       labels = c("Q1", "Q2", "Q3", "Q4"), include.lowest = TRUE)
price_quartiles <- cut(retail_data$Price, breaks = quantile(retail_data$Price, probs = seq(0, 1, 0.25)), 
                       labels = c("Q1", "Q2", "Q3", "Q4"), include.lowest = TRUE)

# Create a contingency table
contingency_table <- table(sales_quartiles, price_quartiles)
print("Contingency Table of Sales Quartiles vs Price Quartiles:")

## [1] "Contingency Table of Sales Quartiles vs Price Quartiles:"

print(contingency_table)

##                price_quartiles
## sales_quartiles Q1 Q2 Q3 Q4
##              Q1 11 16 12 11
##              Q2 13 10 15 12
##              Q3 15 10 13 12
##              Q4 11 14 10 15

# Calculate marginal probabilities
sales_marginal <- prop.table(rowSums(contingency_table))
price_marginal <- prop.table(colSums(contingency_table))

print("Marginal Probabilities for Sales Quartiles:")

## [1] "Marginal Probabilities for Sales Quartiles:"

print(sales_marginal)

##   Q1   Q2   Q3   Q4 
## 0.25 0.25 0.25 0.25

print("Marginal Probabilities for Price Quartiles:")

## [1] "Marginal Probabilities for Price Quartiles:"

print(price_marginal)

##   Q1   Q2   Q3   Q4 
## 0.25 0.25 0.25 0.25

# Calculate joint probabilities
joint_probabilities <- prop.table(contingency_table)
print("Joint Probabilities:")

## [1] "Joint Probabilities:"

print(joint_probabilities)

##                price_quartiles
## sales_quartiles    Q1    Q2    Q3    Q4
##              Q1 0.055 0.080 0.060 0.055
##              Q2 0.065 0.050 0.075 0.060
##              Q3 0.075 0.050 0.065 0.060
##              Q4 0.055 0.070 0.050 0.075

Use Fisher’s Exact Test and the Chi-Square Test to check for independence between Sales and Price. Discuss which test is most appropriate and why.

# Fisher's Exact Test for independence
fisher_test <- fisher.test(contingency_table, simulate.p.value = TRUE, B = 10000)
print("Fisher's Exact Test:")

## [1] "Fisher's Exact Test:"

print(fisher_test)

## 
##  Fisher's Exact Test for Count Data with simulated p-value (based on
##  10000 replicates)
## 
## data:  contingency_table
## p-value = 0.863
## alternative hypothesis: two.sided

# Chi-Square Test for independence
chi_square_test <- chisq.test(contingency_table)
print("Chi-Square Test:")

## [1] "Chi-Square Test:"

print(chi_square_test)

## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 4.8, df = 9, p-value = 0.8514

# Calculate Pearson correlation coefficient
correlation <- cor.test(retail_data$Sales, retail_data$Price)
print("Pearson Correlation Test:")

## [1] "Pearson Correlation Test:"

print(correlation)

## 
##  Pearson's product-moment correlation
## 
## data:  retail_data$Sales and retail_data$Price
## t = 1.4532, df = 198, p-value = 0.1478
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03653442  0.23807516
## sample estimates:
##       cor 
## 0.1027273

Problem 2: Advanced Forecasting and Optimization (Calculus) in Retail

Context: You are working for a large retail chain that wants to optimize pricing, inventory management, and sales forecasting using data-driven strategies. Your task is to use regression, statistical modeling, and calculus-based methods to make informed decisions.

Part 1: Descriptive and Inferential Statistics for Inventory Data

Inventory Data Analysis:

Generate univariate descriptive statistics for the Inventory_Levels and Sales variables.

# Univariate descriptive statistics for Inventory_Levels and Sales
cat("Descriptive Statistics for Inventory Levels:\n")

## Descriptive Statistics for Inventory Levels:

print(summary(retail_data$Inventory_Levels))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   67.35  376.51  483.72  488.55  600.42  858.79

cat("\nStandard Deviation:", sd(retail_data$Inventory_Levels), "\n")

## 
## Standard Deviation: 155.0466

cat("Skewness:", moments::skewness(retail_data$Inventory_Levels), "\n")

## Skewness: 0.09397099

cat("Kurtosis:", moments::kurtosis(retail_data$Inventory_Levels), "\n\n")

## Kurtosis: 2.608498

cat("Descriptive Statistics for Sales:\n")

## Descriptive Statistics for Sales:

print(summary(retail_data$Sales))

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.57  284.42  533.54  636.92  867.58 2447.49

cat("\nStandard Deviation:", sd(retail_data$Sales), "\n")

## 
## Standard Deviation: 463.4995

cat("Skewness:", moments::skewness(retail_data$Sales), "\n")

## Skewness: 1.226333

cat("Kurtosis:", moments::kurtosis(retail_data$Sales), "\n")

## Kurtosis: 4.574007

Create appropriate visualizations such as histograms and scatterplots for Inventory_Levels, Sales, and Price.

# Visualizations
# Histogram for Inventory_Levels
ggplot(retail_data, aes(x = Inventory_Levels)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "black") +
  labs(title = "Distribution of Inventory Levels", x = "Inventory Levels", y = "Frequency") +
  theme_minimal()

# Histogram for Sales
ggplot(retail_data, aes(x = Sales)) +
  geom_histogram(bins = 30, fill = "coral", color = "black") +
  labs(title = "Distribution of Sales", x = "Sales", y = "Frequency") +
  theme_minimal()

# Histogram for Price
ggplot(retail_data, aes(x = Price)) +
  geom_histogram(bins = 30, fill = "lightgreen", color = "black") +
  labs(title = "Distribution of Price", x = "Price", y = "Frequency") +
  theme_minimal()

# Scatterplot for Sales vs. Inventory_Levels
ggplot(retail_data, aes(x = Sales, y = Inventory_Levels)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "Sales vs. Inventory Levels", x = "Sales", y = "Inventory Levels") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

# Scatterplot for Sales vs. Price
ggplot(retail_data, aes(x = Price, y = Sales)) +
  geom_point(color = "darkgreen", alpha = 0.6) +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "Price vs. Sales", x = "Price", y = "Sales") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

Compute a correlation matrix for Sales, Price, and Inventory_Levels.

# Compute correlation matrix
cor_matrix <- cor(retail_data[, c("Sales", "Price", "Inventory_Levels")])
print("Correlation Matrix:")

## [1] "Correlation Matrix:"

print(cor_matrix)

##                        Sales       Price Inventory_Levels
## Sales             1.00000000  0.10272730      -0.03529619
## Price             0.10272730  1.00000000      -0.04025941
## Inventory_Levels -0.03529619 -0.04025941       1.00000000

# Visualize correlation matrix
corrplot(cor_matrix, method = "circle", type = "upper", 
         addCoef.col = "black", tl.col = "black", tl.srt = 45)

Test the hypotheses that the correlations between the variables are zero and provide a 95% confidence interval.

# Sales vs. Price
cor_test_sales_price <- cor.test(retail_data$Sales, retail_data$Price)
print("Correlation Test: Sales vs. Price")

## [1] "Correlation Test: Sales vs. Price"

print(cor_test_sales_price)

## 
##  Pearson's product-moment correlation
## 
## data:  retail_data$Sales and retail_data$Price
## t = 1.4532, df = 198, p-value = 0.1478
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03653442  0.23807516
## sample estimates:
##       cor 
## 0.1027273

# Sales vs. Inventory_Levels
cor_test_sales_inventory <- cor.test(retail_data$Sales, retail_data$Inventory_Levels)
print("Correlation Test: Sales vs. Inventory_Levels")

## [1] "Correlation Test: Sales vs. Inventory_Levels"

print(cor_test_sales_inventory)

## 
##  Pearson's product-moment correlation
## 
## data:  retail_data$Sales and retail_data$Inventory_Levels
## t = -0.49697, df = 198, p-value = 0.6198
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1731891  0.1039539
## sample estimates:
##         cor 
## -0.03529619

# Price vs. Inventory_Levels
cor_test_price_inventory <- cor.test(retail_data$Price, retail_data$Inventory_Levels)
print("Correlation Test: Price vs. Inventory_Levels")

## [1] "Correlation Test: Price vs. Inventory_Levels"

print(cor_test_price_inventory)

## 
##  Pearson's product-moment correlation
## 
## data:  retail_data$Price and retail_data$Inventory_Levels
## t = -0.56696, df = 198, p-value = 0.5714
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.17800614  0.09903478
## sample estimates:
##         cor 
## -0.04025941

Discussion:

Explain the meaning of your findings and discuss the implications of the correlations for inventory management. Would you be concerned about multicollinearity in a potential regression model? Why or why not?

Based on the correlation analysis and descriptive statistics, we found several insights relevant to inventory management, some of the most relevant are:

The correlation between Sales and Inventory Levels The analysis shows a very weak and negative metric values (-0.035), (p-value = 0.6198). This suggests that sales volume and inventory levels are managed independently in the dataset. Also, there is no evidence that higher sales correspond to either higher or lower inventory levels, indicating that inventory might be determined by other factors such as lead time or restocking policies rather than being directly tied to sales volume.

Correlation between Sales and Price The correlation is positive but weak (0.103) and not statistically significant (p-value = 0.1478). This is interesting because economic theory often predicts a negative correlation. The weak positive correlation might suggest that higher-priced items in this dataset also happen to be popular items.

Implications for Inventory Management Given these correlations, inventory managers should consider the following recommendations: traditional inventory models may need adjustment; the lack of strong correlations between these key variables suggests that standard inventory models that assume relationships between sales, inventory, and price may not be directly applicable to this retail context.

Inventory decisions appear decoupled from sales patterns: The data suggests inventory decisions may be made using fixed reorder points or time-based replenishment rather than being dynamically adjusted based on sales patterns.

Data-driven inventory optimization opportunity: The weak relationships present an opportunity to implement more sophisticated inventory management strategies that could better align inventory with actual demand patterns.

Multicollinearity Concerns For regression modeling purposes, multicollinearity would not be a concern based on the correlation matrix. All pairwise correlations between Sales, Price, and Inventory_Levels are very low, far from where multicollinearity becomes problematic.

Part 2: Linear Algebra and Pricing Strategy

Price Elasticity of Demand

Use linear regression to model the relationship between Sales and Price (assuming Sales as the dependent variable).

# Linear regression model for Sales and Price
price_sales_model <- lm(Sales ~ Price, data = retail_data)
summary(price_sales_model)

## 
## Call:
## lm(formula = Sales ~ Price, data = retail_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -679.54 -347.85  -98.63  241.12 1770.08 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  442.951    137.419   3.223  0.00148 **
## Price          9.916      6.824   1.453  0.14775   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 462.2 on 198 degrees of freedom
## Multiple R-squared:  0.01055,    Adjusted R-squared:  0.005556 
## F-statistic: 2.112 on 1 and 198 DF,  p-value: 0.1478

# Extract the regression coefficients
intercept <- coef(price_sales_model)[1]
price_coefficient <- coef(price_sales_model)[2]

# Calculate price elasticity of demand at the mean price
mean_price <- mean(retail_data$Price)
mean_sales <- mean(retail_data$Sales)
price_elasticity <- price_coefficient * (mean_price / mean_sales)

cat("Price Elasticity of Demand at Mean Price:", price_elasticity, "\n")

## Price Elasticity of Demand at Mean Price: 0.3045387

Invert the correlation matrix from your model, and calculate the precision matrix.

# Correlation matrix of variables used in the model
model_vars <- model.matrix(price_sales_model)[, -1] # exclude intercept
cor_matrix <- cor(cbind(model_vars, retail_data$Sales))
print("Correlation Matrix:")

## [1] "Correlation Matrix:"

print(cor_matrix)

##            model_vars          
## model_vars  1.0000000 0.1027273
##             0.1027273 1.0000000

# Invert correlation matrix to get precision matrix
precision_matrix <- solve(cor_matrix)
print("Precision Matrix:")

## [1] "Precision Matrix:"

print(precision_matrix)

##            model_vars           
## model_vars  1.0106655 -0.1038229
##            -0.1038229  1.0106655

Discuss the implications of the diagonal elements of the precision matrix (which are variance inflation factors).

# Discuss implications of diagonal elements (VIFs)
vif_value <- diag(precision_matrix)
cat("Variance Inflation Factors (Diagonal elements of precision matrix):", vif_value, "\n")

## Variance Inflation Factors (Diagonal elements of precision matrix): 1.010665 1.010665

Perform LU decomposition on the correlation matrix and interpret the results in the context of price elasticity.

# Perform LU decomposition on the correlation matrix
L <- matrix(c(1, cor_matrix[2,1]/cor_matrix[1,1], 0, 1), nrow=2)
U <- matrix(c(cor_matrix[1,1], cor_matrix[1,2], 
                    0, cor_matrix[2,2] - L[2,1]*cor_matrix[1,2]), nrow=2)

print("L Matrix:")

## [1] "L Matrix:"

print(L)

##           [,1] [,2]
## [1,] 1.0000000    0
## [2,] 0.1027273    1

print("U Matrix:")

## [1] "U Matrix:"

print(U)

##           [,1]      [,2]
## [1,] 1.0000000 0.0000000
## [2,] 0.1027273 0.9894471

Part 3: Calculus-Based Probability & Statistics for Sales Forecasting

Sales Forecasting Using Exponential Distribution

Identify a variable in the dataset that is skewed to the right (e.g., Sales or Price) and fit an exponential distribution to this data using the fitdistr function.

# Fit an exponential distribution to Sales
sales_exp_fit <- fitdistr(retail_data$Sales, "exponential")
rate_parameter <- sales_exp_fit$estimate["rate"]

# Create a histogram of the original data
hist(retail_data$Sales, freq = FALSE, breaks = 20, 
     main = "Sales Distribution with Fitted Exponential Curve",
     xlab = "Sales", col = "lightblue")
curve(dexp(x, rate = rate_parameter), add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("Actual Sales Data", "Fitted Exponential Distribution"), 
       col = c("lightblue", "red"), lwd = c(10, 2))

Generate 1,000 samples from the fitted exponential distribution and compare a histogram of these samples with the original data’s histogram.

# Generate 1,000 samples from the fitted exponential distribution
set.seed(123) 
simulated_sales <- rexp(1000, rate = rate_parameter)

# Compare histograms of original and simulated data
par(mfrow = c(1, 2))
hist(retail_data$Sales, main = "Original Sales Data", xlab = "Sales", col = "lightblue")
hist(simulated_sales, main = "Simulated Sales Data (Exponential)", xlab = "Sales", col = "pink")

par(mfrow = c(1, 1))

Calculate the 5th and 95th percentiles using the cumulative distribution function (CDF) of the exponential distribution.

# Calculate the 5th and 95th percentiles using the CDF of exponential distribution
percentile_5th <- qexp(0.05, rate = rate_parameter)
percentile_95th <- qexp(0.95, rate = rate_parameter)

cat("5th Percentile from Exponential Distribution:", percentile_5th, "\n")

## 5th Percentile from Exponential Distribution: 32.66953

cat("95th Percentile from Exponential Distribution:", percentile_95th, "\n")

## 95th Percentile from Exponential Distribution: 1908.03

# Calculate empirical 5th and 95th percentiles from the actual data
empirical_5th <- quantile(retail_data$Sales, 0.05)
empirical_95th <- quantile(retail_data$Sales, 0.95)

cat("Empirical 5th Percentile:", empirical_5th, "\n")

## Empirical 5th Percentile: 104.9028

cat("Empirical 95th Percentile:", empirical_95th, "\n")

## Empirical 95th Percentile: 1502.25

Compute a 95% confidence interval for the original data assuming normality and compare it with the empirical percentiles.

# Compute a 95% confidence interval assuming normality
sales_mean <- mean(retail_data$Sales)
sales_sd <- sd(retail_data$Sales)
n <- length(retail_data$Sales)
margin_error <- qt(0.975, df = n-1) * sales_sd / sqrt(n)
ci_lower <- sales_mean - margin_error
ci_upper <- sales_mean + margin_error

cat("95% Confidence Interval (assuming normality):\n")

## 95% Confidence Interval (assuming normality):

cat("Lower Bound:", ci_lower, "\n")

## Lower Bound: 572.2866

cat("Upper Bound:", ci_upper, "\n")

## Upper Bound: 701.5458

Discussion:

Discuss how well the exponential distribution models the data and what this implies for forecasting future sales or pricing. Consider whether a different distribution might be more appropriate.

Based on the comparison between the exponential distribution model and the actual sales data we can come up with some conclusions:

The exponential distribution appears to be an optimal fit for the sales data, as evidenced by multiple diagnostic indicators, such as: Visualizations, The histogram of the original sales data shows a right-skewed distribution, but with a different shape than the fitted exponential curve. Percentile Comparison, there are significant discrepancies between the theoretical and empirical percentiles. Simulated vs. Actual Data: The histogram comparison between the simulated exponential data and the original sales data reveals different distribution shapes.

The implications for sales forecasting can be determine by the following factors:

Overestimation of Extreme Values: The model would predict more extremely low sales values and more extremely high sales values than what actually occurs in the data.

Inaccurate Risk Assessment: Relying on this model could lead to incorrect inventory planning and resource allocation due to the overestimation of tail probabilities.

Unreliable Confidence Intervals: The 95% confidence interval assuming normality (572.29 to 701.55) is quite narrow compared to the range of the actual data, suggesting that neither the normal nor exponential distributions adequately capture the variability in the sales data.

For this particular analysis, I can suggest the Gamma or Weibull distributions, These models offer more flexibility with two parameters (shape and scale) that can better accommodate the observed sales pattern.

Part 4: Regression Modeling for Inventory Optimization

Multiple Regression Model:

Build a multiple regression model to predict Inventory_Levels based on Sales, Lead_Time_Days, and Price.

# Build multiple regression model
inventory_model <- lm(Inventory_Levels ~ Sales + Lead_Time_Days + Price, data = retail_data)

Provide a full summary of your model, including coefficients, R-squared value, and residual analysis.

# Full summary of the model
summary(inventory_model)

## 
## Call:
## lm(formula = Inventory_Levels ~ Sales + Lead_Time_Days + Price, 
##     data = retail_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -395.54 -118.07   -7.68  111.81  372.56 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    464.792662  61.321852   7.580 1.35e-12 ***
## Sales           -0.007809   0.023955  -0.326    0.745    
## Lead_Time_Days   7.316793   5.293049   1.382    0.168    
## Price           -1.087778   2.305846  -0.472    0.638    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 155.3 on 196 degrees of freedom
## Multiple R-squared:  0.01223,    Adjusted R-squared:  -0.002887 
## F-statistic: 0.8091 on 3 and 196 DF,  p-value: 0.4902

# Residual analysis
par(mfrow = c(2, 2))
plot(inventory_model)

par(mfrow = c(1, 1))

# Calculate VIF to check for multicollinearity
vif(inventory_model)

##          Sales Lead_Time_Days          Price 
##       1.017562       1.008633       1.011822

# Predict inventory levels for different scenarios
peak_sales <- 1.2 * mean(retail_data$Sales)
mean_lead_time <- mean(retail_data$Lead_Time_Days)
competitive_price <- 0.95 * mean(retail_data$Price)
# create dataframe
peak_scenario <- data.frame(
  Sales = peak_sales,
  Lead_Time_Days = mean_lead_time,
  Price = competitive_price
)

# Predict inventory levels for peak scenario
predicted_inventory <- predict(inventory_model, newdata = peak_scenario, interval = "prediction")
print("Predicted Inventory Levels for Peak Season:")

## [1] "Predicted Inventory Levels for Peak Season:"

print(predicted_inventory)

##        fit      lwr      upr
## 1 488.6163 181.5371 795.6956

Optimization:

Use your model to optimize inventory levels for a peak sales season, balancing minimizing stockouts with minimizing overstock.

# Create a function for optimizing inventory based on the model
stockout_cost_per_unit <- 2
holding_cost_per_unit <- 0.2

inventory_cost <- function(inventory_level, predicted_demand, stockout_cost, holding_cost) {
  if (inventory_level < predicted_demand) {
    # Stockout cost
    return((predicted_demand - inventory_level) * stockout_cost)
  } else {
    # Holding cost for excess inventory
    return((inventory_level - predicted_demand) * holding_cost)
  }
}

# Predicted demand during peak season
predicted_demand <- peak_sales

# Search for optimal inventory level
inventory_range <- seq(predicted_demand * 0.8, predicted_demand * 1.2, by = 10)
costs <- sapply(inventory_range, function(inv) {
  inventory_cost(inv, predicted_demand, stockout_cost_per_unit, holding_cost_per_unit)
})

optimal_index <- which.min(costs)
optimal_inventory <- inventory_range[optimal_index]

cat("Optimal Inventory Level for Peak Season:", optimal_inventory, "\n")

## Optimal Inventory Level for Peak Season: 771.4396

cat("Associated Cost:", costs[optimal_index], "\n")

## Associated Cost: 1.428022

# Plot the cost function
plot(inventory_range, costs, type = "l", col = "blue", 
     main = "Inventory Cost Function", xlab = "Inventory Level", ylab = "Cost")
abline(v = optimal_inventory, col = "red", lty = 2)
text(optimal_inventory, max(costs) * 0.9, "Optimal", pos = 4, col = "red")

Final Examination: Business Analytics and Data Science

Victor Torres

2025-05-17

Instructions:

Problem 1: Business Risk and Revenue Modeling

Part 1: Empirical and Theoretical Analysis of Distributions

Generate and Analyze Distributions:

Calculate Empirical Expected Value and Variance:

Part 2: Probability Analysis and Independence Testing

Empirical Probabilities

Correlation and Independence:

Problem 2: Advanced Forecasting and Optimization (Calculus) in Retail

Part 1: Descriptive and Inferential Statistics for Inventory Data

Inventory Data Analysis:

Discussion:

Part 2: Linear Algebra and Pricing Strategy

Price Elasticity of Demand

Part 3: Calculus-Based Probability & Statistics for Sales Forecasting

Sales Forecasting Using Exponential Distribution

Discussion:

Part 4: Regression Modeling for Inventory Optimization

Multiple Regression Model:

Optimization: