library(tidyverse)
library(jtools)
# NOTE:
# THE .Rdata file with the data for this assignment also loads the functions I suggest you use, but I am leaving the lines below (commented out) in case you need to get the functions that way for whatever reason (or in the future)
#source("https://gist.github.com/dankatz00/8ee9d546b2b3f781c0973de5d374ea09/raw/smart_ols.R")
#source("https://gist.github.com/dankatz00/8d3725b536a3a0370858ace921f5615a/raw/smart_logit.R")
#source("https://gist.github.com/dankatz00/05012a2e85b839f2e181d0c67b940df0/raw/smart_validation.R")
Load in the “Unit_2_HW_Data.RData” to get the data (and functions) for this analysis.
Use q1_and_q2_data for Questions 1 and 2, use q3_data for Question 3, …
#——————– # PREDICTOR SELECTION #——————–
A company wants to understand how their spending, costs, competitors’ spending, and consumers’ behaviors impact their total revenue.
First, run a regression for total revenue using all other available data. Check for multicollinearity and overfitting. Are there any variables that should be removed from the regression for either/both of these reasons? Why or why not?
Note: while not strictly necessary, I recommend using smart_ols()
# Load required packages
library(tidyverse)
# The data for Q1 is in q1_and_q2_data
str(q1_and_q2_data)
'data.frame': 100 obs. of 11 variables:
$ competitor_ad_spend : num 44395 47698 65587 50705 51293 ...
$ influencer_spend : num 34765 34419 50068 40180 40946 ...
$ ad_spend : num 21533 21746 29864 21365 25807 ...
$ web_traffic : num 539332 531152 825671 619823 698729 ...
$ discount_percent : num 14.63 9.16 11.83 14.86 18.35 ...
$ customer_service_spend: num 4398 4006 6027 5751 3491 ...
$ avg_star_rating : num 4.72 4.77 3.98 4.44 3.97 ...
$ overhead_costs : num 307 10536 13044 2746 12403 ...
$ rnd_spend : num 24231 14342 18069 16284 13883 ...
$ shipping_costs : num 23936 19318 27115 11907 36246 ...
$ total_revenue : num 349702 337629 512457 387185 449773 ...
colnames(q1_and_q2_data)
[1] "competitor_ad_spend" "influencer_spend" "ad_spend"
[4] "web_traffic" "discount_percent" "customer_service_spend"
[7] "avg_star_rating" "overhead_costs" "rnd_spend"
[10] "shipping_costs" "total_revenue"
# Fit a regression model for total revenue using all other available predictors
model_q1 <- lm(total_revenue ~ ., data = q1_and_q2_data)
summary(model_q1)
Call:
lm(formula = total_revenue ~ ., data = q1_and_q2_data)
Residuals:
Min 1Q Median 3Q Max
-32992 -10646 322 9640 56138
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.065e+04 1.955e+04 -1.056 0.2937
competitor_ad_spend -1.072e+00 5.917e-01 -1.813 0.0733 .
influencer_spend 4.354e-01 5.002e-01 0.870 0.3864
ad_spend 1.360e+00 8.375e-01 1.624 0.1079
web_traffic 6.290e-01 3.306e-02 19.027 <2e-16 ***
discount_percent 2.240e+02 3.360e+02 0.667 0.5067
customer_service_spend -1.206e+00 1.757e+00 -0.686 0.4943
avg_star_rating 1.185e+02 2.744e+03 0.043 0.9657
overhead_costs -3.009e-02 3.126e-01 -0.096 0.9235
rnd_spend 4.447e-01 2.695e-01 1.650 0.1025
shipping_costs 5.855e-01 2.270e-01 2.579 0.0115 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 15650 on 89 degrees of freedom
Multiple R-squared: 0.9593, Adjusted R-squared: 0.9547
F-statistic: 209.8 on 10 and 89 DF, p-value: < 2.2e-16
# Check for multicollinearity using a pairwise correlation matrix.
# Every numeric column is included, so the matrix also contains the outcome
# (total_revenue); only the predictor-vs-predictor entries are evidence of
# multicollinearity.
# vapply() always returns a logical mask, whereas sapply() can change its
# return type (e.g. a list for empty input).
numeric_cols <- vapply(q1_and_q2_data, is.numeric, logical(1))
cor_matrix <- cor(q1_and_q2_data[numeric_cols])
print(cor_matrix)
competitor_ad_spend influencer_spend ad_spend web_traffic
competitor_ad_spend 1.000000e+00 0.89878548 0.87165685 0.901979480
influencer_spend 8.987855e-01 1.00000000 0.78991874 0.803178655
ad_spend 8.716568e-01 0.78991874 1.00000000 0.843652926
web_traffic 9.019795e-01 0.80317866 0.84365293 1.000000000
discount_percent -1.927111e-01 -0.16576231 -0.15998798 -0.179315089
customer_service_spend -5.648439e-02 -0.02554113 -0.03102035 -0.024603793
avg_star_rating 7.277891e-02 0.06358275 0.01198185 0.041931443
overhead_costs 1.570370e-01 0.19489854 0.17953112 0.199106458
rnd_spend -2.214862e-05 -0.03388537 0.01351932 -0.001937661
shipping_costs 1.719307e-01 0.19467681 0.16330443 0.160970511
total_revenue 8.728616e-01 0.78665847 0.83686288 0.975754247
discount_percent customer_service_spend avg_star_rating
competitor_ad_spend -0.19271110 -0.05648439 0.07277891
influencer_spend -0.16576231 -0.02554113 0.06358275
ad_spend -0.15998798 -0.03102035 0.01198185
web_traffic -0.17931509 -0.02460379 0.04193144
discount_percent 1.00000000 0.20661771 -0.15884027
customer_service_spend 0.20661771 1.00000000 -0.10127733
avg_star_rating -0.15884027 -0.10127733 1.00000000
overhead_costs 0.04717406 0.05348873 0.04176611
rnd_spend -0.06110018 0.02433769 -0.06429154
shipping_costs -0.05977530 -0.19523946 0.04612582
total_revenue -0.16529166 -0.04336049 0.03598494
overhead_costs rnd_spend shipping_costs total_revenue
competitor_ad_spend 0.15703700 -2.214862e-05 0.17193067 0.87286157
influencer_spend 0.19489854 -3.388537e-02 0.19467681 0.78665847
ad_spend 0.17953112 1.351932e-02 0.16330443 0.83686288
web_traffic 0.19910646 -1.937661e-03 0.16097051 0.97575425
discount_percent 0.04717406 -6.110018e-02 -0.05977530 -0.16529166
customer_service_spend 0.05348873 2.433769e-02 -0.19523946 -0.04336049
avg_star_rating 0.04176611 -6.429154e-02 0.04612582 0.03598494
overhead_costs 1.00000000 1.353752e-02 -0.15402893 0.18753211
rnd_spend 0.01353752 1.000000e+00 -0.03395423 0.02984970
shipping_costs -0.15402893 -3.395423e-02 1.00000000 0.21721665
total_revenue 0.18753211 2.984970e-02 0.21721665 1.00000000
# Optionally, check for high correlations (absolute value > 0.8)
# The `< 1` condition excludes entries equal to 1 in absolute value, i.e. the
# diagonal (and any perfectly correlated pair).
# cor_matrix is symmetric, so each pair is reported twice (row/col and
# col/row). Rows or columns for total_revenue are correlations with the
# outcome, not multicollinearity among predictors.
which(abs(cor_matrix) > 0.8 & abs(cor_matrix) < 1, arr.ind = TRUE)
row col
influencer_spend 2 1
ad_spend 3 1
web_traffic 4 1
total_revenue 11 1
competitor_ad_spend 1 2
web_traffic 4 2
competitor_ad_spend 1 3
web_traffic 4 3
total_revenue 11 3
competitor_ad_spend 1 4
influencer_spend 2 4
ad_spend 3 4
total_revenue 11 4
competitor_ad_spend 1 11
ad_spend 3 11
web_traffic 4 11
# If smart_ols() is available, use it for model selection and multicollinearity diagnostics
# smart_ols(model_q1)
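The regression shows strong multicollinearity among four predictors: competitor_ad_spend, influencer_spend, ad_spend, and web_traffic are all correlated with one another at roughly 0.79–0.90, which inflates their standard errors and makes their individual effects on total revenue hard to interpret (only web_traffic and shipping_costs are significant at the 5% level despite an R-squared of 0.96). Several other predictors (for example avg_star_rating, p = 0.97, and overhead_costs, p = 0.92) add essentially nothing and increase the risk of overfitting with only 100 observations. To improve model reliability and avoid overfitting, I recommend removing or combining the highly correlated variables and dropping the predictors that do not contribute. This will result in a more stable and interpretable model for understanding the factors that impact total revenue.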
Continue refining your model from Q1. Your goal is to get accurate predictions and learn the relationships between total revenue and the predictors using a model that is as parsimonious as possible. What would your final model be, and why did you choose that model?
Note: there are multiple answers that could be correct
# Start with the data for Q1/Q2
library(tidyverse)
# Review the structure and variables again
str(q1_and_q2_data)
'data.frame': 100 obs. of 11 variables:
$ competitor_ad_spend : num 44395 47698 65587 50705 51293 ...
$ influencer_spend : num 34765 34419 50068 40180 40946 ...
$ ad_spend : num 21533 21746 29864 21365 25807 ...
$ web_traffic : num 539332 531152 825671 619823 698729 ...
$ discount_percent : num 14.63 9.16 11.83 14.86 18.35 ...
$ customer_service_spend: num 4398 4006 6027 5751 3491 ...
$ avg_star_rating : num 4.72 4.77 3.98 4.44 3.97 ...
$ overhead_costs : num 307 10536 13044 2746 12403 ...
$ rnd_spend : num 24231 14342 18069 16284 13883 ...
$ shipping_costs : num 23936 19318 27115 11907 36246 ...
$ total_revenue : num 349702 337629 512457 387185 449773 ...
# Stepwise regression to select a parsimonious model
# (if MASS is available, otherwise use base step())
# NOTE(review): attaching MASS after tidyverse masks dplyr::select(); any
# later unqualified select() call would resolve to MASS::select(). Call
# dplyr::select() explicitly if it is needed further down.
library(MASS)
# Start from the full model: total_revenue regressed on every other column.
full_model <- lm(total_revenue ~ ., data = q1_and_q2_data)
# direction = "both" allows terms to be dropped and re-added at each step;
# trace = FALSE suppresses the step-by-step log.
step_model <- stepAIC(full_model, direction = "both", trace = FALSE)
summary(step_model)
Call:
lm(formula = total_revenue ~ competitor_ad_spend + ad_spend +
web_traffic + rnd_spend + shipping_costs, data = q1_and_q2_data)
Residuals:
Min 1Q Median 3Q Max
-35517 -11295 -26 10551 56646
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2.091e+04 1.047e+04 -1.997 0.04870 *
competitor_ad_spend -7.495e-01 4.459e-01 -1.681 0.09613 .
ad_spend 1.381e+00 8.157e-01 1.694 0.09366 .
web_traffic 6.253e-01 3.201e-02 19.534 < 2e-16 ***
rnd_spend 4.106e-01 2.621e-01 1.566 0.12060
shipping_costs 6.323e-01 2.129e-01 2.971 0.00377 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 15350 on 94 degrees of freedom
Multiple R-squared: 0.9587, Adjusted R-squared: 0.9565
F-statistic: 435.9 on 5 and 94 DF, p-value: < 2.2e-16
# See which predictors remain in the final model.
# The intercept is dropped by name rather than by position, so a model fitted
# without an intercept would not silently lose its first predictor.
final_vars <- setdiff(names(coef(step_model)), "(Intercept)")
print(final_vars)
[1] "competitor_ad_spend" "ad_spend" "web_traffic"
[4] "rnd_spend" "shipping_costs"
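The final model keeps five predictors (competitor_ad_spend, ad_spend, web_traffic, rnd_spend, and shipping_costs), keeping the model simple while maintaining strong predictive accuracy: its adjusted R-squared (0.9565) is slightly higher than the full model's (0.9547) with half as many predictors. I chose this parsimonious model because it avoids overfitting and is easier to interpret, while still effectively explaining total revenue. Note that competitor_ad_spend, ad_spend, and web_traffic remain highly correlated with one another (about 0.84–0.90), so their individual coefficients should still be interpreted with caution.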
#————– # Influencer spend × discount percent: Influencer campaigns are more effective when discounts are higher. #Influencer spend × star rating: Influencer impact grows with better product ratings. #Competitor ad spend × discount percent: Competitor ads can reduce the benefit of discounts. #Ad spend × shipping costs: High shipping costs may weaken the effect of ad spending. #————–
You work for a brick-and-mortar store that also has an e-commerce website. Your company runs an experiment to test the effect of month-long promotional discounts online and in-store. Analyze the effect of discount strategy (whether there was a discount or not) on sales revenue (in dollars per month) depending on sales channel (online vs. in-store). Answer the following: i) What is the effect of discount strategy on revenue for in-store purchases? ii) What is the effect of discount strategy on revenue for online purchases? iii) Is the effect of discount strategy on revenue significantly different across the two sales channels?
# Examine structure and columns of the data
str(q3_data)
tibble [150 × 3] (S3: tbl_df/tbl/data.frame)
$ monthly_revenue : num [1:150] 11171 11419 9781 9093 11212 ...
$ sales_channel : chr [1:150] "Online" "In_store" "Online" "In_store" ...
$ discount_strategy: chr [1:150] "Discount_absent" "Discount_absent" "Discount_absent" "Discount_present" ...
colnames(q3_data)
[1] "monthly_revenue" "sales_channel" "discount_strategy"
# Fit a linear model with main effects and interaction
# Columns used: monthly_revenue (outcome), discount_strategy, sales_channel
# `a * b` in a formula expands to a + b + a:b (both main effects plus their
# interaction). Both predictors are character columns, so lm() dummy-codes
# them; the coefficient names in the summary show the reference levels are
# Discount_absent and In_store.
model_q3 <- lm(monthly_revenue ~ discount_strategy * sales_channel, data = q3_data)
summary(model_q3)
Call:
lm(formula = monthly_revenue ~ discount_strategy * sales_channel,
data = q3_data)
Residuals:
Min 1Q Median 3Q Max
-4081.3 -1840.4 182.3 1387.9 4586.9
Coefficients:
Estimate Std. Error t value
(Intercept) 10387.13 364.51 28.496
discount_strategyDiscount_present -19.83 523.02 -0.038
sales_channelOnline -1066.55 527.08 -2.023
discount_strategyDiscount_present:sales_channelOnline 1629.79 721.29 2.260
Pr(>|t|)
(Intercept) <2e-16 ***
discount_strategyDiscount_present 0.9698
sales_channelOnline 0.0448 *
discount_strategyDiscount_present:sales_channelOnline 0.0253 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2187 on 146 degrees of freedom
Multiple R-squared: 0.06771, Adjusted R-squared: 0.04855
F-statistic: 3.535 on 3 and 146 DF, p-value: 0.0164
# For estimated marginal means (optional, if emmeans package is available)
# install.packages("emmeans") # Uncomment if needed
# library(emmeans)
# emmeans(model_q3, pairwise ~ discount_strategy | sales_channel)
# To see group means by channel and discount
aggregate(monthly_revenue ~ discount_strategy + sales_channel, data = q3_data, mean)
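In-store, the discount had essentially no effect on revenue: the estimated change is −$19.83 per month and is not statistically significant (p = 0.97). Online, the discount raised revenue by about $1,610 per month (−19.83 + 1,629.79). The interaction term (1,629.79, p = 0.025) shows that the impact of discounts on revenue is significantly different between the two sales channels.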
You are a market researcher studying the effect of discount amounts on total revenue in a particular industry. Specifically, you want to know whether the effect of the size of a discount on revenue depends on the initial price of the product. You have data on 200 month-long promotions for various products, including the initial price of the product, the discount (as a percentage), and the monthly revenue generated from that product. How does the effect of discount percentage on monthly revenue change depending on the initial price?
# Check structure and column names
# str(q4_data)
# colnames(q4_data)
# Columns used below: initial_price, discount_as_percentage, monthly_revenue
# Fit a linear model with interaction between discount percentage and initial price
# The discount_as_percentage:initial_price coefficient measures how the effect
# of the discount on revenue changes per unit of initial price.
model_q4 <- lm(monthly_revenue ~ discount_as_percentage * initial_price, data = q4_data)
summary(model_q4)
Call:
lm(formula = monthly_revenue ~ discount_as_percentage * initial_price,
data = q4_data)
Residuals:
Min 1Q Median 3Q Max
-2499.97 -680.50 -15.31 649.07 2672.12
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 396.9879 458.0216 0.867 0.3871
discount_as_percentage 5.8814 34.3230 0.171 0.8641
initial_price -6.4114 2.7493 -2.332 0.0207 *
discount_as_percentage:initial_price 5.1045 0.2107 24.222 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 999.5 on 196 degrees of freedom
Multiple R-squared: 0.9718, Adjusted R-squared: 0.9713
F-statistic: 2248 on 3 and 196 DF, p-value: < 2.2e-16
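The effect of discount percentage on monthly revenue grows with the initial price: the interaction coefficient (5.10, p < 0.001) means that each additional $1 of initial price increases the effect of a one-percentage-point discount on monthly revenue by about 5.10. The discount coefficient on its own (5.88, p = 0.86) is the estimated effect at an initial price of 0 and is not significant, so discounts matter little for very cheap products and matter more and more as the starting price rises. This means discount strategies should be adjusted based on the initial price of each product, with discounts paying off most on higher-priced products.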
#————– # Discount elasticity: 1.07 # Initial price elasticity: 0.96 #————–
You run a company that sells Product2602 (i.e, some generic product that I am not defining for you). In order to help decide the price you should set, you analyze data from 31 other companies that sell the same product at varying prices. You have data on the price they set (in dollars) and the number of widgets they sold. Run a regression to estimate the impact of a $1 increase in price on the number of units sold.
# Q5: level-level OLS of units sold on price.
# The price coefficient is the estimated change in units sold for a $1
# increase in price.
model_q5 <- lm(units_sold ~ price, data = q5_and_q6_data)
summary(model_q5)
Call:
lm(formula = units_sold ~ price, data = q5_and_q6_data)
Residuals:
Min 1Q Median 3Q Max
-15.087 -6.394 -2.616 4.425 51.226
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 37.8636 6.4967 5.828 2.55e-06 ***
price -2.3175 0.4894 -4.736 5.28e-05 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 12.18 on 29 degrees of freedom
Multiple R-squared: 0.4361, Adjusted R-squared: 0.4167
F-statistic: 22.43 on 1 and 29 DF, p-value: 5.283e-05
A $1 increase in price is associated with about 2.32 fewer units sold on average (coefficient = −2.3175, p < 0.001). Because the coefficient is negative and significant, raising the price by a dollar leads to selling fewer units on average.
Estimate the own-price elasticity for Product2602. In your conclusion, give an example of what “Product2602” might actually be (many possible correct answers) and justify why you chose your example product.
# Fit a log-log regression model to estimate price elasticity
model_q6 <- lm(log_units_sold ~ log_price, data = q5_and_q6_data)
summary(model_q6)
Call:
lm(formula = log_units_sold ~ log_price, data = q5_and_q6_data)
Residuals:
Min 1Q Median 3Q Max
-1.30743 -0.45833 -0.04564 0.44825 1.29598
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 10.2445 0.7606 13.47 5.21e-14 ***
log_price -3.8126 0.3061 -12.46 3.65e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.6773 on 29 degrees of freedom
Multiple R-squared: 0.8425, Adjusted R-squared: 0.8371
F-statistic: 155.1 on 1 and 29 DF, p-value: 3.647e-13
# Extract the elasticity coefficient
elasticity <- coef(model_q6)["log_price"]
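The estimated own-price elasticity is about −3.81: a 1% increase in price is associated with roughly a 3.81% decrease in units sold, so demand is highly elastic (very sensitive to price). For example, Product2602 could be a particular brand of generic printer paper: it is a commodity with many nearly identical substitutes, so buyers readily switch to a competitor when one seller raises its price.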
Evaluate the performance of your regressions in Q5 and Q6 by performing Leave-one-out Cross-validation (LOOCV). Does one model perform better than the other and how do you know that? If there is a difference in performance, what is your explanation for why that is the case?
Hint: while not strictly required, you may want to use the k_cv() function
# Leave-one-out cross-validation (LOOCV) for the Q5 and Q6 models.
# Both models are scored on the original units-sold scale so that their mean
# squared errors can be compared directly.

# Squared out-of-sample prediction error for every row of `data`.
#   model_formula  - formula refit on each training set (all rows but one)
#   data           - data frame with the model variables and the outcome
#   back_transform - function applied to each prediction to put it on the
#                    scale of `actual_col` (identity by default)
#   actual_col     - name of the column holding the observed outcome
# Returns a numeric vector with one squared error per row.
loocv_sq_errors <- function(model_formula, data, back_transform = identity,
                            actual_col = "units_sold") {
  vapply(
    # seq_len() gives zero iterations for an empty data frame; 1:n would
    # iterate over c(1, 0).
    seq_len(nrow(data)),
    function(i) {
      train_data <- data[-i, ]
      test_data <- data[i, ]
      fit <- lm(model_formula, data = train_data)
      pred <- back_transform(predict(fit, newdata = test_data))
      (test_data[[actual_col]] - pred)^2
    },
    numeric(1)
  )
}

n <- nrow(q5_and_q6_data)

# For Q5 Linear Regression Model (units_sold ~ price)
# Predictions are already in units sold, so no back-transformation is needed.
mse_q5 <- loocv_sq_errors(units_sold ~ price, q5_and_q6_data)
loocv_mse_q5 <- mean(mse_q5)

# For Q6 Log-Log Elasticity Model (log_units_sold ~ log_price)
# exp() converts each log-scale prediction back to original units
# (assumes log_units_sold is the natural log of units_sold -- TODO confirm).
mse_q6 <- loocv_sq_errors(
  log_units_sold ~ log_price,
  q5_and_q6_data,
  back_transform = exp
)
loocv_mse_q6 <- mean(mse_q6)
# Compare performance
cat("LOOCV MSE for Q5 Linear Model:", loocv_mse_q5, "\n")
LOOCV MSE for Q5 Linear Model: 166.6101
cat("LOOCV MSE for Q6 Log-Log Model:", loocv_mse_q6, "\n")
LOOCV MSE for Q6 Log-Log Model: 155.5709
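The model with the lower LOOCV MSE predicts better out of sample. Here the log-log model (Q6) has the lower MSE (155.57 vs. 166.61 for the linear model in Q5, both measured in units sold), so it performs better. The likely explanation is that the relationship between price and units sold is multiplicative (constant elasticity) rather than linear: the log-log model explains much more of the variance on its own scale (R-squared of 0.84 vs. 0.44), and the linear model's residuals are strongly right-skewed (a maximum residual of 51.2 against a median of −2.6), which suggests a straight line misses curvature in the data.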
You are doing market research for a company that sells three products: A, B, and C. They believe there is a relationship between the number of units of A they sell and the prices of B and C. You have data on how many units of Product A were sold at various prices for Products B and C.
Calculate the cross price elasticities for A relative to B and A relative to C. Based on your results, give an example of what Product A, B, and C might actually be (many possible correct answers). Justify why you chose your example products.
Hint: It is only necessary to run one regression
# Fit a log-log regression to calculate cross-price elasticities
model_q8 <- lm(log(units_sold_A) ~ log(price_B) + log(price_C), data = q8_data)
summary(model_q8)
Call:
lm(formula = log(units_sold_A) ~ log(price_B) + log(price_C),
data = q8_data)
Residuals:
Min 1Q Median 3Q Max
-1.72083 -0.34530 0.00045 0.33344 1.23381
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.9270 0.8640 5.703 1.27e-07 ***
log(price_B) 1.4082 0.1445 9.744 4.78e-16 ***
log(price_C) -0.6971 0.1030 -6.766 1.01e-09 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.5507 on 97 degrees of freedom
Multiple R-squared: 0.5932, Adjusted R-squared: 0.5848
F-statistic: 70.72 on 2 and 97 DF, p-value: < 2.2e-16
# Extract elasticity coefficients
elasticity_B <- coef(model_q8)["log(price_B)"]
elasticity_C <- coef(model_q8)["log(price_C)"]
The results show that Product A and B are substitutes (positive elasticity), while A and C are complements (negative elasticity). For example, if Product A is coffee, Product B could be tea (a substitute), and Product C could be sugar (a complement). Changes in prices of B and C affect the demand for A accordingly.
#——————– # model_q8 <- lm(log(units_sold_A) ~ log(price_B) + log(price_C), data = q8_data) summary(model_q8) #——————–
You work for a company that launches different ad campaigns on a regular basis. Your task is to determine what factors impact the likelihood that the ad gets clicked on. Simulate the following data and familiarize yourself with the variables in q9_and_q10_data.
Run a logistic regression with all variables in q9_and_q10_data. Interpret the AME (average marginal effect) column for every predictor in the regression.
Note: you should use smart_logit()
# Check the actual column names in your data
colnames(q9_and_q10_data)
[1] "ad_channel" "discount_percent" "click"
# The data has three columns: ad_channel, discount_percent, and click.
# `click ~ .` uses click as the outcome and every other column as a predictor.
# Run the logistic regression using smart_logit (if available in your environment)
# NOTE(review): smart_logit() is not defined in this file; per the header note
# it is loaded with the .RData file (or via the commented-out source() lines).
smart_logit(click ~ ., data = q9_and_q10_data)
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.5/faraway_1.0.9.tgz'
Content type 'application/x-gzip' length 794514 bytes (775 KB)
==================================================
downloaded 775 KB
The downloaded binary packages are in
/var/folders/l9/z7pnmynn24j60q_8j98_3z800000gn/T//RtmpND9yWa/downloaded_packages
| DV: click | Odds_&_OR | AME | p_value | Cohens_d |
|---|---|---|---|---|
| discount_percent | 2.08 | 0.05 | 0.000*** | 0.40 |
| Observations | 1000 |
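The AME shows how a one-unit change in a predictor changes the probability of an ad being clicked, averaged over all observations. For discount_percent the AME is 0.05: each additional percentage point of discount increases the probability of a click by about 5 percentage points on average, and the effect is statistically significant (p < 0.001). Positive values increase click likelihood, negative values decrease it, and larger values mean a bigger impact. Note that the output table above shows a row only for discount_percent; ad_channel does not appear, so its AME cannot be interpreted from this output. Use these results to identify which factors most influence ad clicks.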
Run a Linear Probability Model (LPM, aka OLS) with all variables in q9_and_q10_data. Compare the model fit of the LPM to the fit of the logistic regression from Q9. Does one model fit the data better than the other? If so, why is that the case?
# Linear Probability Model (LPM): OLS with click as the outcome
model_lpm <- lm(click ~ ., data = q9_and_q10_data)
# Logistic Regression on the same formula and data
model_logit <- glm(click ~ ., data = q9_and_q10_data, family = binomial)
# Compare model fit using AIC (lower is better)
# NOTE(review): the two AICs come from different likelihoods (Gaussian for
# lm(), binomial for glm()), so treat the gap as a rough guide and consider
# confirming with a common-scale metric such as cross-validated prediction
# error.
cat("LPM AIC:", AIC(model_lpm), "\n")
LPM AIC: 488.7335
cat("Logistic Regression AIC:", AIC(model_logit), "\n")
Logistic Regression AIC: 422.0868
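The logistic regression fits better than the LPM: its AIC (422.09) is lower than the LPM's (488.73). This is expected because logistic regression keeps predicted probabilities between 0 and 1 and is more appropriate for binary outcomes, whereas the LPM assumes a linear, unbounded relationship with constant error variance.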