Restaurant Site Selection (R)

library(car)
library(lattice)

correlation_heat_map <- function(cormat) {
  vars <- rownames(cormat)
  grid <- expand.grid(Var1 = vars, Var2 = vars)
  grid$value <- as.vector(cormat)
  print(
    levelplot(
      value ~ Var1 * Var2,
      data = grid,
      xlab = "", ylab = "",
      col.regions = colorRampPalette(c("blue", "white", "red"))(100),
      at = seq(-1, 1, length.out = 101),
      colorkey = list(space = "right"),
      scales = list(x = list(rot = 45)),
      main = "Correlation Heat Map"
    )
  )
}
restdata <- read.csv("studenmunds_restaurants.csv", header = TRUE)
print(str(restdata))
## 'data.frame':    33 obs. of  4 variables:
##  $ sales      : int  107919 118866 98579 122015 152827 91259 123550 160931 98496 108052 ...
##  $ competition: int  3 5 7 2 3 5 8 2 6 2 ...
##  $ population : int  65044 101376 124989 55249 73775 48484 138809 50244 104300 37852 ...
##  $ income     : int  13240 22554 16916 20967 19576 15039 21857 26435 24024 14987 ...
## NULL
print(restdata)
##     sales competition population income
## 1  107919           3      65044  13240
## 2  118866           5     101376  22554
## 3   98579           7     124989  16916
## 4  122015           2      55249  20967
## 5  152827           3      73775  19576
## 6   91259           5      48484  15039
## 7  123550           8     138809  21857
## 8  160931           2      50244  26435
## 9   98496           6     104300  24024
## 10 108052           2      37852  14987
## 11 144788           3      66921  30902
## 12 164571           4     166332  31573
## 13 105564           3      61951  19001
## 14 102568           5     100441  20058
## 15 103342           2      39462  16194
## 16 127030           5     139900  21384
## 17 166755           6     171740  18800
## 18 125343           6     149894  15289
## 19 121886           3      57386  16702
## 20 134594           6     185105  19093
## 21 152937           3     114520  26502
## 22 109622           3      52933  18760
## 23 149884           5     203500  33242
## 24  98388           4      39334  14988
## 25 140791           3      95120  18505
## 26 101260           3      49200  16839
## 27 139517           4     113566  28915
## 28 115236           9     194125  19033
## 29 136749           7     233844  19200
## 30 105067           7      83416  22833
## 31 136872           6     183953  14409
## 32 117146           3      60457  20307
## 33 163538           2      65065  20111
print(summary(restdata))
##      sales         competition      population         income     
##  Min.   : 91259   Min.   :2.000   Min.   : 37852   Min.   :13240  
##  1st Qu.:105564   1st Qu.:3.000   1st Qu.: 57386   1st Qu.:16839  
##  Median :122015   Median :4.000   Median : 95120   Median :19200  
##  Mean   :125635   Mean   :4.394   Mean   :103887   Mean   :20553  
##  3rd Qu.:140791   3rd Qu.:6.000   3rd Qu.:139900   3rd Qu.:22554  
##  Max.   :166755   Max.   :9.000   Max.   :233844   Max.   :33242
with(restdata, hist(
  sales / 1000,
  xlab = "Sales (thousands)",
  ylab = "Frequency",
  main = "Distribution of Restaurant Sales",
  las = 1,
  col = "steelblue",
  border = "white"
))

with(restdata, plot(
  sort(sales / 1000),
  (1:length(sales)) / length(sales),
  type = "s", ylim = c(0, 1), las = 1,
  xlab = "Restaurants Ordered by Sales (thousands)",
  ylab = "Proportion of Restaurants with Lower Sales",
  main = "CDF of Restaurant Sales",
  col = "steelblue"
))

pairs(
  restdata,
  panel = function(x, y) {
    points(x, y, pch = 19, cex = 0.7, col = "gray40")
    abline(lm(y ~ x), lty = "solid", col = "red")
    lines(lowess(x, y), col = "blue")
  },
  main = "Scatter Plot Matrix - Studenmund's Restaurants"
)

restdata_cormat <- cor(restdata[, c("sales", "competition", "population", "income")])
print(round(restdata_cormat, 3))
##              sales competition population income
## sales        1.000      -0.144      0.393  0.537
## competition -0.144       1.000      0.726 -0.032
## population   0.393       0.726      1.000  0.245
## income       0.537      -0.032      0.245  1.000
correlation_heat_map(cormat = restdata_cormat)

restdata_model <- sales ~ competition + population + income
restdata_fit   <- lm(restdata_model, data = restdata)
print(summary(restdata_fit))
## 
## Call:
## lm(formula = restdata_model, data = restdata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -21923  -8627  -2956   5328  33887 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.022e+05  1.280e+04   7.984 8.35e-09 ***
## competition -9.075e+03  2.053e+03  -4.421 0.000126 ***
## population   3.547e-01  7.268e-02   4.880 3.54e-05 ***
## income       1.288e+00  5.433e-01   2.371 0.024623 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14540 on 29 degrees of freedom
## Multiple R-squared:  0.6182, Adjusted R-squared:  0.5787 
## F-statistic: 15.65 on 3 and 29 DF,  p-value: 3.058e-06
print(vif(restdata_fit))
## competition  population      income 
##    2.348446    2.496186    1.180772
par(mfrow = c(2, 2), mar = c(4, 4, 2, 2))
plot(restdata_fit)

par(mfrow = c(1, 1))
sites <- data.frame(
  sales       = c(NA, NA, NA),
  competition = c(2,     3,      5),
  population  = c(50000, 200000, 220000),
  income      = c(25000, 22000,  19000)
)
sites$predicted_sales <- round(predict(restdata_fit, newdata = sites), 0)
print(sites)
##   sales competition population income predicted_sales
## 1    NA           2      50000  25000          133975
## 2    NA           3     200000  22000          174236
## 3    NA           5     220000  19000          159317
bar_positions <- barplot(
  sites$predicted_sales,
  names.arg = c("Site 1", "Site 2", "Site 3"),
  main = "Predicted Sales by Site",
  ylab = "Predicted Sales ($)",
  col = c("steelblue", "darkgreen", "orange"),
  border = "white",
  ylim = c(0, max(sites$predicted_sales) * 1.15)
)

text(
  x      = bar_positions,
  y      = sites$predicted_sales,
  labels = paste0("$", format(sites$predicted_sales, big.mark = ",")),
  pos    = 3,
  cex    = 0.9,
  col    = "black"
)


Suggestions: Explore alternative regression methods (ridge, lasso, elastic net) and compare with OLS results. Evaluate out-of-sample predictive power using cross-validation. For a time series extension, consider the Lydia Pinkham or Wisconsin Lottery datasets.