library(car)
library(lattice)
correlation_heat_map <- function(cormat) {
vars <- rownames(cormat)
grid <- expand.grid(Var1 = vars, Var2 = vars)
grid$value <- as.vector(cormat)
print(
levelplot(
value ~ Var1 * Var2,
data = grid,
xlab = "", ylab = "",
col.regions = colorRampPalette(c("blue", "white", "red"))(100),
at = seq(-1, 1, length.out = 101),
colorkey = list(space = "right"),
scales = list(x = list(rot = 45)),
main = "Correlation Heat Map"
)
)
}
restdata <- read.csv("studenmunds_restaurants.csv", header = TRUE)
print(str(restdata))
## 'data.frame': 33 obs. of 4 variables:
## $ sales : int 107919 118866 98579 122015 152827 91259 123550 160931 98496 108052 ...
## $ competition: int 3 5 7 2 3 5 8 2 6 2 ...
## $ population : int 65044 101376 124989 55249 73775 48484 138809 50244 104300 37852 ...
## $ income : int 13240 22554 16916 20967 19576 15039 21857 26435 24024 14987 ...
## NULL
print(restdata)
## sales competition population income
## 1 107919 3 65044 13240
## 2 118866 5 101376 22554
## 3 98579 7 124989 16916
## 4 122015 2 55249 20967
## 5 152827 3 73775 19576
## 6 91259 5 48484 15039
## 7 123550 8 138809 21857
## 8 160931 2 50244 26435
## 9 98496 6 104300 24024
## 10 108052 2 37852 14987
## 11 144788 3 66921 30902
## 12 164571 4 166332 31573
## 13 105564 3 61951 19001
## 14 102568 5 100441 20058
## 15 103342 2 39462 16194
## 16 127030 5 139900 21384
## 17 166755 6 171740 18800
## 18 125343 6 149894 15289
## 19 121886 3 57386 16702
## 20 134594 6 185105 19093
## 21 152937 3 114520 26502
## 22 109622 3 52933 18760
## 23 149884 5 203500 33242
## 24 98388 4 39334 14988
## 25 140791 3 95120 18505
## 26 101260 3 49200 16839
## 27 139517 4 113566 28915
## 28 115236 9 194125 19033
## 29 136749 7 233844 19200
## 30 105067 7 83416 22833
## 31 136872 6 183953 14409
## 32 117146 3 60457 20307
## 33 163538 2 65065 20111
print(summary(restdata))
## sales competition population income
## Min. : 91259 Min. :2.000 Min. : 37852 Min. :13240
## 1st Qu.:105564 1st Qu.:3.000 1st Qu.: 57386 1st Qu.:16839
## Median :122015 Median :4.000 Median : 95120 Median :19200
## Mean :125635 Mean :4.394 Mean :103887 Mean :20553
## 3rd Qu.:140791 3rd Qu.:6.000 3rd Qu.:139900 3rd Qu.:22554
## Max. :166755 Max. :9.000 Max. :233844 Max. :33242
with(restdata, hist(
sales / 1000,
xlab = "Sales (thousands)",
ylab = "Frequency",
main = "Distribution of Restaurant Sales",
las = 1,
col = "steelblue",
border = "white"
))
with(restdata, plot(
sort(sales / 1000),
(1:length(sales)) / length(sales),
type = "s", ylim = c(0, 1), las = 1,
xlab = "Restaurants Ordered by Sales (thousands)",
ylab = "Proportion of Restaurants with Lower Sales",
main = "CDF of Restaurant Sales",
col = "steelblue"
))
pairs(
restdata,
panel = function(x, y) {
points(x, y, pch = 19, cex = 0.7, col = "gray40")
abline(lm(y ~ x), lty = "solid", col = "red")
lines(lowess(x, y), col = "blue")
},
main = "Scatter Plot Matrix - Studenmund's Restaurants"
)
restdata_cormat <- cor(restdata[, c("sales", "competition", "population", "income")])
print(round(restdata_cormat, 3))
## sales competition population income
## sales 1.000 -0.144 0.393 0.537
## competition -0.144 1.000 0.726 -0.032
## population 0.393 0.726 1.000 0.245
## income 0.537 -0.032 0.245 1.000
correlation_heat_map(cormat = restdata_cormat)
restdata_model <- sales ~ competition + population + income
restdata_fit <- lm(restdata_model, data = restdata)
print(summary(restdata_fit))
##
## Call:
## lm(formula = restdata_model, data = restdata)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21923 -8627 -2956 5328 33887
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.022e+05 1.280e+04 7.984 8.35e-09 ***
## competition -9.075e+03 2.053e+03 -4.421 0.000126 ***
## population 3.547e-01 7.268e-02 4.880 3.54e-05 ***
## income 1.288e+00 5.433e-01 2.371 0.024623 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14540 on 29 degrees of freedom
## Multiple R-squared: 0.6182, Adjusted R-squared: 0.5787
## F-statistic: 15.65 on 3 and 29 DF, p-value: 3.058e-06
print(vif(restdata_fit))
## competition population income
## 2.348446 2.496186 1.180772
par(mfrow = c(2, 2), mar = c(4, 4, 2, 2))
plot(restdata_fit)
par(mfrow = c(1, 1))
sites <- data.frame(
sales = c(NA, NA, NA),
competition = c(2, 3, 5),
population = c(50000, 200000, 220000),
income = c(25000, 22000, 19000)
)
sites$predicted_sales <- round(predict(restdata_fit, newdata = sites), 0)
print(sites)
## sales competition population income predicted_sales
## 1 NA 2 50000 25000 133975
## 2 NA 3 200000 22000 174236
## 3 NA 5 220000 19000 159317
bar_positions <- barplot(
sites$predicted_sales,
names.arg = c("Site 1", "Site 2", "Site 3"),
main = "Predicted Sales by Site",
ylab = "Predicted Sales ($)",
col = c("steelblue", "darkgreen", "orange"),
border = "white",
ylim = c(0, max(sites$predicted_sales) * 1.15)
)
text(
x = bar_positions,
y = sites$predicted_sales,
labels = paste0("$", format(sites$predicted_sales, big.mark = ",")),
pos = 3,
cex = 0.9,
col = "black"
)
Suggestions: Explore alternative regression methods (ridge, lasso, elastic net) and compare with OLS results. Evaluate out-of-sample predictive power using cross-validation. For a time series extension, consider the Lydia Pinkham or Wisconsin Lottery datasets.