This project will showcase your ability to apply the concepts learned throughout the course.
The dataset you will use for this examination is provided as retail data.csv, which contains the following variables:
Product_ID: Unique identifier for each product.
Sales: Simulated sales numbers (in dollars).
Inventory_Levels: Inventory levels for each product.
Lead_Time_Days: The lead time in days for each product.
Price: The price of each product.
Seasonality_Index: An index representing seasonality.
# load dataset
retail_data <- read.csv("C:/Users/vitug/OneDrive/Desktop/CUNY Masters/DATA_605/synthetic_retail_data.csv")
# head
head(retail_data)
# summary
summary(retail_data)
## Product_ID Sales Inventory_Levels Lead_Time_Days
## Min. : 1.00 Min. : 25.57 Min. : 67.35 Min. : 0.491
## 1st Qu.: 50.75 1st Qu.: 284.42 1st Qu.:376.51 1st Qu.: 5.291
## Median :100.50 Median : 533.54 Median :483.72 Median : 6.765
## Mean :100.50 Mean : 636.92 Mean :488.55 Mean : 6.834
## 3rd Qu.:150.25 3rd Qu.: 867.58 3rd Qu.:600.42 3rd Qu.: 8.212
## Max. :200.00 Max. :2447.49 Max. :858.79 Max. :12.722
## Price Seasonality_Index
## Min. : 5.053 Min. :0.3305
## 1st Qu.:16.554 1st Qu.:0.8475
## Median :19.977 Median :0.9762
## Mean :19.560 Mean :0.9829
## 3rd Qu.:22.924 3rd Qu.:1.1205
## Max. :29.404 Max. :1.5958
# data structure
str(retail_data)
## 'data.frame': 200 obs. of 6 variables:
## $ Product_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Sales : num 158 279 699 1832 460 ...
## $ Inventory_Levels : num 367 427 408 392 448 ...
## $ Lead_Time_Days : num 6.31 5.8 3.07 3.53 10.8 ...
## $ Price : num 18.8 26.1 22.4 27.1 18.3 ...
## $ Seasonality_Index: num 1.184 0.857 0.699 0.698 0.841 ...
Context: You are a data scientist working for a retail chain that models sales, inventory levels, and the impact of pricing and seasonality on revenue. Your task is to analyze various distributions that can describe sales variability and forecast potential revenue.
X ~ Sales: Consider the Sales variable from the dataset. Assume it follows a Gamma distribution and estimate its shape and scale parameters using the fitdistr function from the MASS package.
# Fit Gamma distribution to Sales
sales_gamma_fit <- fitdistr(retail_data$Sales, "gamma")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
sales_shape <- sales_gamma_fit$estimate["shape"]
sales_rate <- sales_gamma_fit$estimate["rate"]
sales_scale <- 1/sales_rate
# Create a histogram of Sales with the fitted Gamma distribution
hist(retail_data$Sales, freq = FALSE, breaks = 20,
main = "Histogram of Sales with Fitted Gamma Distribution",
xlab = "Sales", col = "lightblue")
curve(dgamma(x, shape = sales_shape, rate = sales_rate),
add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("Actual Data", "Gamma Distribution"),
col = c("lightblue", "red"), lwd = c(10, 2))
Y ~ Inventory Levels: Assume that the sum of inventory levels across similar products follows a Lognormal distribution. Estimate the parameters for this distribution.
# Fit Lognormal distribution to Inventory_Levels
inventory_lognormal_fit <- fitdistr(retail_data$Inventory_Levels, "lognormal")
inventory_meanlog <- inventory_lognormal_fit$estimate["meanlog"]
inventory_sdlog <- inventory_lognormal_fit$estimate["sdlog"]
# Create a histogram of Inventory_Levels with the fitted Lognormal distribution
hist(retail_data$Inventory_Levels, freq = FALSE, breaks = 20,
main = "Histogram of Inventory Levels with Fitted Lognormal Distribution",
xlab = "Inventory Levels", col = "lightgreen")
curve(dlnorm(x, meanlog = inventory_meanlog, sdlog = inventory_sdlog),
add = TRUE, col = "blue", lwd = 2)
legend("topright", legend = c("Actual Data", "Lognormal Distribution"),
col = c("lightgreen", "blue"), lwd = c(10, 2))
Z ~ Lead Time: Assume that Lead_Time_Days follows a Normal distribution. Estimate the mean and standard deviation.
# Fit Normal distribution to Lead_Time_Days
leadtime_normal_fit <- fitdistr(retail_data$Lead_Time_Days, "normal")
leadtime_mean <- leadtime_normal_fit$estimate["mean"]
leadtime_sd <- leadtime_normal_fit$estimate["sd"]
# Create a histogram of Lead_Time_Days with the fitted Normal distribution
hist(retail_data$Lead_Time_Days, freq = FALSE, breaks = 20,
main = "Histogram of Lead Time Days with Fitted Normal Distribution",
xlab = "Lead Time Days", col = "lightyellow")
curve(dnorm(x, mean = leadtime_mean, sd = leadtime_sd),
add = TRUE, col = "darkgreen", lwd = 2)
legend("topright", legend = c("Actual Data", "Normal Distribution"),
col = c("lightyellow", "darkgreen"), lwd = c(10, 2))
Calculate the empirical mean and variance for all three variables.
# Calculate empirical statistics for all three variables
empirical_stats <- data.frame(
Variable = c("Sales", "Inventory_Levels", "Lead_Time_Days"),
Empirical_Mean = c(
mean(retail_data$Sales),
mean(retail_data$Inventory_Levels),
mean(retail_data$Lead_Time_Days)
),
Empirical_Variance = c(
var(retail_data$Sales),
var(retail_data$Inventory_Levels),
var(retail_data$Lead_Time_Days)
)
)
Compare these empirical values with the theoretical values derived from the estimated distribution parameters.
# Calculate theoretical values from estimated distribution parameters
theoretical_mean_sales <- sales_shape / sales_rate
theoretical_var_sales <- sales_shape / (sales_rate^2)
# For Lognormal
theoretical_mean_inventory <- exp(inventory_meanlog + (inventory_sdlog^2)/2)
theoretical_var_inventory <- (exp(inventory_sdlog^2) - 1) * exp(2*inventory_meanlog + inventory_sdlog^2)
# For Normal
theoretical_mean_leadtime <- leadtime_mean
theoretical_var_leadtime <- leadtime_sd^2
theoretical_stats <- data.frame(
Variable = c("Sales", "Inventory_Levels", "Lead_Time_Days"),
Theoretical_Mean = c(
theoretical_mean_sales,
theoretical_mean_inventory,
theoretical_mean_leadtime
),
Theoretical_Variance = c(
theoretical_var_sales,
theoretical_var_inventory,
theoretical_var_leadtime
)
)
# Combine empirical and theoretical statistics
comparison_stats <- merge(empirical_stats, theoretical_stats, by = "Variable")
# Calculate percent differences
comparison_stats$Mean_Pct_Diff <- 100 * abs(comparison_stats$Empirical_Mean - comparison_stats$Theoretical_Mean) / comparison_stats$Empirical_Mean
comparison_stats$Var_Pct_Diff <- 100 * abs(comparison_stats$Empirical_Variance - comparison_stats$Theoretical_Variance) / comparison_stats$Empirical_Variance
# Create a comparison table
kable(comparison_stats, digits = 3, caption = "Comparison of Empirical vs Theoretical Values")
| Variable | Empirical_Mean | Empirical_Variance | Theoretical_Mean | Theoretical_Variance | Mean_Pct_Diff | Var_Pct_Diff |
|---|---|---|---|---|---|---|
| Inventory_Levels | 488.547 | 24039.446 | 492.276 | 34197.47 | 0.763 | 42.256 |
| Lead_Time_Days | 6.834 | 4.362 | 6.834 | 4.34 | 0.000 | 0.500 |
| Sales | 636.916 | 214831.751 | 636.915 | 221073.18 | 0.000 | 2.905 |
# Visual comparison of empirical vs theoretical means
barplot(rbind(comparison_stats$Empirical_Mean, comparison_stats$Theoretical_Mean),
beside = TRUE,
col = c("darkblue", "lightblue"),
names.arg = comparison_stats$Variable,
main = "Comparison of Empirical vs Theoretical Means",
legend.text = c("Empirical Mean", "Theoretical Mean"))
# Visual comparison of empirical vs theoretical variances
barplot(rbind(comparison_stats$Empirical_Variance, comparison_stats$Theoretical_Variance),
beside = TRUE,
col = c("darkred", "pink"),
names.arg = comparison_stats$Variable,
main = "Comparison of Empirical vs Theoretical Variances",
legend.text = c("Empirical Variance", "Theoretical Variance"))
For the Lead_Time_Days variable (assumed to be normally distributed), calculate the following empirical probabilities:
\[(Z > \mu \mid Z > \mu - \sigma)\] \[P(Z > \mu + \sigma \mid Z > \mu)\] \[P(Z > \mu + 2\sigma \mid Z > \mu)\]
# create lead_time function
lead_time <- retail_data$Lead_Time_Days
# Calculate the mean and standard deviation
mean_lt <- mean(lead_time)
sd_lt <- sd(lead_time)
# Calculate empirical probabilities p1
p1_numerator <- sum(lead_time > mean_lt)
p1_denominator <- sum(lead_time > (mean_lt - sd_lt))
p1 <- p1_numerator / p1_denominator
# Calculate empirical probabilities p2
p2_numerator <- sum(lead_time > (mean_lt + sd_lt))
p2_denominator <- sum(lead_time > mean_lt)
p2 <- p2_numerator / p2_denominator
# Calculate empirical probabilities p3
p3_numerator <- sum(lead_time > (mean_lt + 2*sd_lt))
p3_denominator <- sum(lead_time > mean_lt)
p3 <- p3_numerator / p3_denominator
# Display the results
cat("Empirical Conditional Probabilities for Lead Time:\n")
## Empirical Conditional Probabilities for Lead Time:
cat("P(Z > μ | Z > μ - σ) =", p1, "\n")
## P(Z > μ | Z > μ - σ) = 0.5928144
cat("P(Z > μ + σ | Z > μ) =", p2, "\n")
## P(Z > μ + σ | Z > μ) = 0.3131313
cat("P(Z > μ + 2σ | Z > μ) =", p3, "\n")
## P(Z > μ + 2σ | Z > μ) = 0.03030303
# Compare with theoretical conditional probabilities for a normal distribution
cat("\nTheoretical Conditional Probabilities for Normal Distribution:\n")
##
## Theoretical Conditional Probabilities for Normal Distribution:
cat("P(Z > μ | Z > μ - σ) =", 0.5/0.8413, "\n")
## P(Z > μ | Z > μ - σ) = 0.5943183
cat("P(Z > μ + σ | Z > μ) =", 0.1587/0.5, "\n")
## P(Z > μ + σ | Z > μ) = 0.3174
cat("P(Z > μ + 2σ | Z > μ) =", 0.0228/0.5, "\n")
## P(Z > μ + 2σ | Z > μ) = 0.0456
Investigate the correlation between Sales and Price. Create a contingency table using quartiles of Sales and Price, and then evaluate the marginal and joint probabilities.
# Create quartiles for Sales and Price
sales_quartiles <- cut(retail_data$Sales, breaks = quantile(retail_data$Sales, probs = seq(0, 1, 0.25)),
labels = c("Q1", "Q2", "Q3", "Q4"), include.lowest = TRUE)
price_quartiles <- cut(retail_data$Price, breaks = quantile(retail_data$Price, probs = seq(0, 1, 0.25)),
labels = c("Q1", "Q2", "Q3", "Q4"), include.lowest = TRUE)
# Create a contingency table
contingency_table <- table(sales_quartiles, price_quartiles)
print("Contingency Table of Sales Quartiles vs Price Quartiles:")
## [1] "Contingency Table of Sales Quartiles vs Price Quartiles:"
print(contingency_table)
## price_quartiles
## sales_quartiles Q1 Q2 Q3 Q4
## Q1 11 16 12 11
## Q2 13 10 15 12
## Q3 15 10 13 12
## Q4 11 14 10 15
# Calculate marginal probabilities
sales_marginal <- prop.table(rowSums(contingency_table))
price_marginal <- prop.table(colSums(contingency_table))
print("Marginal Probabilities for Sales Quartiles:")
## [1] "Marginal Probabilities for Sales Quartiles:"
print(sales_marginal)
## Q1 Q2 Q3 Q4
## 0.25 0.25 0.25 0.25
print("Marginal Probabilities for Price Quartiles:")
## [1] "Marginal Probabilities for Price Quartiles:"
print(price_marginal)
## Q1 Q2 Q3 Q4
## 0.25 0.25 0.25 0.25
# Calculate joint probabilities
joint_probabilities <- prop.table(contingency_table)
print("Joint Probabilities:")
## [1] "Joint Probabilities:"
print(joint_probabilities)
## price_quartiles
## sales_quartiles Q1 Q2 Q3 Q4
## Q1 0.055 0.080 0.060 0.055
## Q2 0.065 0.050 0.075 0.060
## Q3 0.075 0.050 0.065 0.060
## Q4 0.055 0.070 0.050 0.075
Use Fisher’s Exact Test and the Chi-Square Test to check for independence between Sales and Price. Discuss which test is most appropriate and why.
# Fisher's Exact Test for independence
fisher_test <- fisher.test(contingency_table, simulate.p.value = TRUE, B = 10000)
print("Fisher's Exact Test:")
## [1] "Fisher's Exact Test:"
print(fisher_test)
##
## Fisher's Exact Test for Count Data with simulated p-value (based on
## 10000 replicates)
##
## data: contingency_table
## p-value = 0.863
## alternative hypothesis: two.sided
# Chi-Square Test for independence
chi_square_test <- chisq.test(contingency_table)
print("Chi-Square Test:")
## [1] "Chi-Square Test:"
print(chi_square_test)
##
## Pearson's Chi-squared test
##
## data: contingency_table
## X-squared = 4.8, df = 9, p-value = 0.8514
# Calculate Pearson correlation coefficient
correlation <- cor.test(retail_data$Sales, retail_data$Price)
print("Pearson Correlation Test:")
## [1] "Pearson Correlation Test:"
print(correlation)
##
## Pearson's product-moment correlation
##
## data: retail_data$Sales and retail_data$Price
## t = 1.4532, df = 198, p-value = 0.1478
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03653442 0.23807516
## sample estimates:
## cor
## 0.1027273
Context: You are working for a large retail chain that wants to optimize pricing, inventory management, and sales forecasting using data-driven strategies. Your task is to use regression, statistical modeling, and calculus-based methods to make informed decisions.
Generate univariate descriptive statistics for the Inventory_Levels and Sales variables.
# Univariate descriptive statistics for Inventory_Levels and Sales
cat("Descriptive Statistics for Inventory Levels:\n")
## Descriptive Statistics for Inventory Levels:
print(summary(retail_data$Inventory_Levels))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 67.35 376.51 483.72 488.55 600.42 858.79
cat("\nStandard Deviation:", sd(retail_data$Inventory_Levels), "\n")
##
## Standard Deviation: 155.0466
cat("Skewness:", moments::skewness(retail_data$Inventory_Levels), "\n")
## Skewness: 0.09397099
cat("Kurtosis:", moments::kurtosis(retail_data$Inventory_Levels), "\n\n")
## Kurtosis: 2.608498
cat("Descriptive Statistics for Sales:\n")
## Descriptive Statistics for Sales:
print(summary(retail_data$Sales))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.57 284.42 533.54 636.92 867.58 2447.49
cat("\nStandard Deviation:", sd(retail_data$Sales), "\n")
##
## Standard Deviation: 463.4995
cat("Skewness:", moments::skewness(retail_data$Sales), "\n")
## Skewness: 1.226333
cat("Kurtosis:", moments::kurtosis(retail_data$Sales), "\n")
## Kurtosis: 4.574007
Create appropriate visualizations such as histograms and scatterplots for Inventory_Levels, Sales, and Price.
# Visualizations
# Histogram for Inventory_Levels
ggplot(retail_data, aes(x = Inventory_Levels)) +
geom_histogram(bins = 30, fill = "steelblue", color = "black") +
labs(title = "Distribution of Inventory Levels", x = "Inventory Levels", y = "Frequency") +
theme_minimal()
# Histogram for Sales
ggplot(retail_data, aes(x = Sales)) +
geom_histogram(bins = 30, fill = "coral", color = "black") +
labs(title = "Distribution of Sales", x = "Sales", y = "Frequency") +
theme_minimal()
# Histogram for Price
ggplot(retail_data, aes(x = Price)) +
geom_histogram(bins = 30, fill = "lightgreen", color = "black") +
labs(title = "Distribution of Price", x = "Price", y = "Frequency") +
theme_minimal()
# Scatterplot for Sales vs. Inventory_Levels
ggplot(retail_data, aes(x = Sales, y = Inventory_Levels)) +
geom_point(color = "blue", alpha = 0.6) +
geom_smooth(method = "lm", color = "red") +
labs(title = "Sales vs. Inventory Levels", x = "Sales", y = "Inventory Levels") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# Scatterplot for Sales vs. Price
ggplot(retail_data, aes(x = Price, y = Sales)) +
geom_point(color = "darkgreen", alpha = 0.6) +
geom_smooth(method = "lm", color = "red") +
labs(title = "Price vs. Sales", x = "Price", y = "Sales") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Compute a correlation matrix for Sales, Price, and Inventory_Levels.
# Compute correlation matrix
cor_matrix <- cor(retail_data[, c("Sales", "Price", "Inventory_Levels")])
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(cor_matrix)
## Sales Price Inventory_Levels
## Sales 1.00000000 0.10272730 -0.03529619
## Price 0.10272730 1.00000000 -0.04025941
## Inventory_Levels -0.03529619 -0.04025941 1.00000000
# Visualize correlation matrix
corrplot(cor_matrix, method = "circle", type = "upper",
addCoef.col = "black", tl.col = "black", tl.srt = 45)
Test the hypotheses that the correlations between the variables are zero and provide a 95% confidence interval.
# Sales vs. Price
cor_test_sales_price <- cor.test(retail_data$Sales, retail_data$Price)
print("Correlation Test: Sales vs. Price")
## [1] "Correlation Test: Sales vs. Price"
print(cor_test_sales_price)
##
## Pearson's product-moment correlation
##
## data: retail_data$Sales and retail_data$Price
## t = 1.4532, df = 198, p-value = 0.1478
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.03653442 0.23807516
## sample estimates:
## cor
## 0.1027273
# Sales vs. Inventory_Levels
cor_test_sales_inventory <- cor.test(retail_data$Sales, retail_data$Inventory_Levels)
print("Correlation Test: Sales vs. Inventory_Levels")
## [1] "Correlation Test: Sales vs. Inventory_Levels"
print(cor_test_sales_inventory)
##
## Pearson's product-moment correlation
##
## data: retail_data$Sales and retail_data$Inventory_Levels
## t = -0.49697, df = 198, p-value = 0.6198
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.1731891 0.1039539
## sample estimates:
## cor
## -0.03529619
# Price vs. Inventory_Levels
cor_test_price_inventory <- cor.test(retail_data$Price, retail_data$Inventory_Levels)
print("Correlation Test: Price vs. Inventory_Levels")
## [1] "Correlation Test: Price vs. Inventory_Levels"
print(cor_test_price_inventory)
##
## Pearson's product-moment correlation
##
## data: retail_data$Price and retail_data$Inventory_Levels
## t = -0.56696, df = 198, p-value = 0.5714
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.17800614 0.09903478
## sample estimates:
## cor
## -0.04025941
Explain the meaning of your findings and discuss the implications of the correlations for inventory management. Would you be concerned about multicollinearity in a potential regression model? Why or why not?
Based on the correlation analysis and descriptive statistics, we found several insights relevant to inventory management, some of the most relevant are:
The correlation between Sales and Inventory Levels The analysis shows a very weak and negative metric values (-0.035), (p-value = 0.6198). This suggests that sales volume and inventory levels are managed independently in the dataset. Also, there is no evidence that higher sales correspond to either higher or lower inventory levels, indicating that inventory might be determined by other factors such as lead time or restocking policies rather than being directly tied to sales volume.
Correlation between Sales and Price The correlation is positive but weak (0.103) and not statistically significant (p-value = 0.1478). This is interesting because economic theory often predicts a negative correlation. The weak positive correlation might suggest that higher-priced items in this dataset also happen to be popular items.
Implications for Inventory Management Given these correlations, inventory managers should consider the following recommendations: traditional inventory models may need adjustment; the lack of strong correlations between these key variables suggests that standard inventory models that assume relationships between sales, inventory, and price may not be directly applicable to this retail context.
Inventory decisions appear decoupled from sales patterns: The data suggests inventory decisions may be made using fixed reorder points or time-based replenishment rather than being dynamically adjusted based on sales patterns.
Data-driven inventory optimization opportunity: The weak relationships present an opportunity to implement more sophisticated inventory management strategies that could better align inventory with actual demand patterns.
Multicollinearity Concerns For regression modeling purposes, multicollinearity would not be a concern based on the correlation matrix. All pairwise correlations between Sales, Price, and Inventory_Levels are very low, far from where multicollinearity becomes problematic.
Use linear regression to model the relationship between Sales and Price (assuming Sales as the dependent variable).
# Linear regression model for Sales and Price
price_sales_model <- lm(Sales ~ Price, data = retail_data)
summary(price_sales_model)
##
## Call:
## lm(formula = Sales ~ Price, data = retail_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -679.54 -347.85 -98.63 241.12 1770.08
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 442.951 137.419 3.223 0.00148 **
## Price 9.916 6.824 1.453 0.14775
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 462.2 on 198 degrees of freedom
## Multiple R-squared: 0.01055, Adjusted R-squared: 0.005556
## F-statistic: 2.112 on 1 and 198 DF, p-value: 0.1478
# Extract the regression coefficients
intercept <- coef(price_sales_model)[1]
price_coefficient <- coef(price_sales_model)[2]
# Calculate price elasticity of demand at the mean price
mean_price <- mean(retail_data$Price)
mean_sales <- mean(retail_data$Sales)
price_elasticity <- price_coefficient * (mean_price / mean_sales)
cat("Price Elasticity of Demand at Mean Price:", price_elasticity, "\n")
## Price Elasticity of Demand at Mean Price: 0.3045387
Invert the correlation matrix from your model, and calculate the precision matrix.
# Correlation matrix of variables used in the model
model_vars <- model.matrix(price_sales_model)[, -1] # exclude intercept
cor_matrix <- cor(cbind(model_vars, retail_data$Sales))
print("Correlation Matrix:")
## [1] "Correlation Matrix:"
print(cor_matrix)
## model_vars
## model_vars 1.0000000 0.1027273
## 0.1027273 1.0000000
# Invert correlation matrix to get precision matrix
precision_matrix <- solve(cor_matrix)
print("Precision Matrix:")
## [1] "Precision Matrix:"
print(precision_matrix)
## model_vars
## model_vars 1.0106655 -0.1038229
## -0.1038229 1.0106655
Discuss the implications of the diagonal elements of the precision matrix (which are variance inflation factors).
# Discuss implications of diagonal elements (VIFs)
vif_value <- diag(precision_matrix)
cat("Variance Inflation Factors (Diagonal elements of precision matrix):", vif_value, "\n")
## Variance Inflation Factors (Diagonal elements of precision matrix): 1.010665 1.010665
Perform LU decomposition on the correlation matrix and interpret the results in the context of price elasticity.
# Perform LU decomposition on the correlation matrix
L <- matrix(c(1, cor_matrix[2,1]/cor_matrix[1,1], 0, 1), nrow=2)
U <- matrix(c(cor_matrix[1,1], cor_matrix[1,2],
0, cor_matrix[2,2] - L[2,1]*cor_matrix[1,2]), nrow=2)
print("L Matrix:")
## [1] "L Matrix:"
print(L)
## [,1] [,2]
## [1,] 1.0000000 0
## [2,] 0.1027273 1
print("U Matrix:")
## [1] "U Matrix:"
print(U)
## [,1] [,2]
## [1,] 1.0000000 0.0000000
## [2,] 0.1027273 0.9894471
Identify a variable in the dataset that is skewed to the right (e.g., Sales or Price) and fit an exponential distribution to this data using the fitdistr function.
# Fit an exponential distribution to Sales
sales_exp_fit <- fitdistr(retail_data$Sales, "exponential")
rate_parameter <- sales_exp_fit$estimate["rate"]
# Create a histogram of the original data
hist(retail_data$Sales, freq = FALSE, breaks = 20,
main = "Sales Distribution with Fitted Exponential Curve",
xlab = "Sales", col = "lightblue")
curve(dexp(x, rate = rate_parameter), add = TRUE, col = "red", lwd = 2)
legend("topright", legend = c("Actual Sales Data", "Fitted Exponential Distribution"),
col = c("lightblue", "red"), lwd = c(10, 2))
Generate 1,000 samples from the fitted exponential distribution and compare a histogram of these samples with the original data’s histogram.
# Generate 1,000 samples from the fitted exponential distribution
set.seed(123)
simulated_sales <- rexp(1000, rate = rate_parameter)
# Compare histograms of original and simulated data
par(mfrow = c(1, 2))
hist(retail_data$Sales, main = "Original Sales Data", xlab = "Sales", col = "lightblue")
hist(simulated_sales, main = "Simulated Sales Data (Exponential)", xlab = "Sales", col = "pink")
par(mfrow = c(1, 1))
Calculate the 5th and 95th percentiles using the cumulative distribution function (CDF) of the exponential distribution.
# Calculate the 5th and 95th percentiles using the CDF of exponential distribution
percentile_5th <- qexp(0.05, rate = rate_parameter)
percentile_95th <- qexp(0.95, rate = rate_parameter)
cat("5th Percentile from Exponential Distribution:", percentile_5th, "\n")
## 5th Percentile from Exponential Distribution: 32.66953
cat("95th Percentile from Exponential Distribution:", percentile_95th, "\n")
## 95th Percentile from Exponential Distribution: 1908.03
# Calculate empirical 5th and 95th percentiles from the actual data
empirical_5th <- quantile(retail_data$Sales, 0.05)
empirical_95th <- quantile(retail_data$Sales, 0.95)
cat("Empirical 5th Percentile:", empirical_5th, "\n")
## Empirical 5th Percentile: 104.9028
cat("Empirical 95th Percentile:", empirical_95th, "\n")
## Empirical 95th Percentile: 1502.25
Compute a 95% confidence interval for the original data assuming normality and compare it with the empirical percentiles.
# Compute a 95% confidence interval assuming normality
sales_mean <- mean(retail_data$Sales)
sales_sd <- sd(retail_data$Sales)
n <- length(retail_data$Sales)
margin_error <- qt(0.975, df = n-1) * sales_sd / sqrt(n)
ci_lower <- sales_mean - margin_error
ci_upper <- sales_mean + margin_error
cat("95% Confidence Interval (assuming normality):\n")
## 95% Confidence Interval (assuming normality):
cat("Lower Bound:", ci_lower, "\n")
## Lower Bound: 572.2866
cat("Upper Bound:", ci_upper, "\n")
## Upper Bound: 701.5458
Discuss how well the exponential distribution models the data and what this implies for forecasting future sales or pricing. Consider whether a different distribution might be more appropriate.
Based on the comparison between the exponential distribution model and the actual sales data we can come up with some conclusions:
The exponential distribution appears to be an optimal fit for the sales data, as evidenced by multiple diagnostic indicators, such as: Visualizations, The histogram of the original sales data shows a right-skewed distribution, but with a different shape than the fitted exponential curve. Percentile Comparison, there are significant discrepancies between the theoretical and empirical percentiles. Simulated vs. Actual Data: The histogram comparison between the simulated exponential data and the original sales data reveals different distribution shapes.
The implications for sales forecasting can be determine by the following factors:
Overestimation of Extreme Values: The model would predict more extremely low sales values and more extremely high sales values than what actually occurs in the data.
Inaccurate Risk Assessment: Relying on this model could lead to incorrect inventory planning and resource allocation due to the overestimation of tail probabilities.
Unreliable Confidence Intervals: The 95% confidence interval assuming normality (572.29 to 701.55) is quite narrow compared to the range of the actual data, suggesting that neither the normal nor exponential distributions adequately capture the variability in the sales data.
For this particular analysis, I can suggest the Gamma or Weibull distributions, These models offer more flexibility with two parameters (shape and scale) that can better accommodate the observed sales pattern.
Build a multiple regression model to predict Inventory_Levels based on Sales, Lead_Time_Days, and Price.
# Build multiple regression model
inventory_model <- lm(Inventory_Levels ~ Sales + Lead_Time_Days + Price, data = retail_data)
Provide a full summary of your model, including coefficients, R-squared value, and residual analysis.
# Full summary of the model
summary(inventory_model)
##
## Call:
## lm(formula = Inventory_Levels ~ Sales + Lead_Time_Days + Price,
## data = retail_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -395.54 -118.07 -7.68 111.81 372.56
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 464.792662 61.321852 7.580 1.35e-12 ***
## Sales -0.007809 0.023955 -0.326 0.745
## Lead_Time_Days 7.316793 5.293049 1.382 0.168
## Price -1.087778 2.305846 -0.472 0.638
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 155.3 on 196 degrees of freedom
## Multiple R-squared: 0.01223, Adjusted R-squared: -0.002887
## F-statistic: 0.8091 on 3 and 196 DF, p-value: 0.4902
# Residual analysis
par(mfrow = c(2, 2))
plot(inventory_model)
par(mfrow = c(1, 1))
# Calculate VIF to check for multicollinearity
vif(inventory_model)
## Sales Lead_Time_Days Price
## 1.017562 1.008633 1.011822
# Predict inventory levels for different scenarios
peak_sales <- 1.2 * mean(retail_data$Sales)
mean_lead_time <- mean(retail_data$Lead_Time_Days)
competitive_price <- 0.95 * mean(retail_data$Price)
# create dataframe
peak_scenario <- data.frame(
Sales = peak_sales,
Lead_Time_Days = mean_lead_time,
Price = competitive_price
)
# Predict inventory levels for peak scenario
predicted_inventory <- predict(inventory_model, newdata = peak_scenario, interval = "prediction")
print("Predicted Inventory Levels for Peak Season:")
## [1] "Predicted Inventory Levels for Peak Season:"
print(predicted_inventory)
## fit lwr upr
## 1 488.6163 181.5371 795.6956
Use your model to optimize inventory levels for a peak sales season, balancing minimizing stockouts with minimizing overstock.
# Create a function for optimizing inventory based on the model
stockout_cost_per_unit <- 2
holding_cost_per_unit <- 0.2
inventory_cost <- function(inventory_level, predicted_demand, stockout_cost, holding_cost) {
if (inventory_level < predicted_demand) {
# Stockout cost
return((predicted_demand - inventory_level) * stockout_cost)
} else {
# Holding cost for excess inventory
return((inventory_level - predicted_demand) * holding_cost)
}
}
# Predicted demand during peak season
predicted_demand <- peak_sales
# Search for optimal inventory level
inventory_range <- seq(predicted_demand * 0.8, predicted_demand * 1.2, by = 10)
costs <- sapply(inventory_range, function(inv) {
inventory_cost(inv, predicted_demand, stockout_cost_per_unit, holding_cost_per_unit)
})
optimal_index <- which.min(costs)
optimal_inventory <- inventory_range[optimal_index]
cat("Optimal Inventory Level for Peak Season:", optimal_inventory, "\n")
## Optimal Inventory Level for Peak Season: 771.4396
cat("Associated Cost:", costs[optimal_index], "\n")
## Associated Cost: 1.428022
# Plot the cost function
plot(inventory_range, costs, type = "l", col = "blue",
main = "Inventory Cost Function", xlab = "Inventory Level", ylab = "Cost")
abline(v = optimal_inventory, col = "red", lty = 2)
text(optimal_inventory, max(costs) * 0.9, "Optimal", pos = 4, col = "red")