# Load necessary libraries
library(wooldridge)  # For the DISCRIM dataset
library(dplyr)       # For data manipulation
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)     # For data visualization
library(sandwich)    # For robust standard errors
library(lmtest)      # For hypothesis tests and model comparisons
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# Load the DISCRIM dataset from the wooldridge package
data <- wooldridge::discrim

# Inspect the first few rows of the data to understand its structure
head(data)
##   psoda pfries pentree wagest nmgrs nregs hrsopen  emp psoda2 pfries2 pentree2
## 1  1.12   1.06    1.02   4.25     3     5    16.0 27.5   1.11    1.11     1.05
## 2  1.06   0.91    0.95   4.75     3     3    16.5 21.5   1.05    0.89     0.95
## 3  1.06   0.91    0.98   4.25     3     5    18.0 30.0   1.05    0.94     0.98
## 4  1.12   1.02    1.06   5.00     4     5    16.0 27.5   1.15    1.05     1.05
## 5  1.12     NA    0.49   5.00     3     3    16.0  5.0   1.04    1.01     0.58
## 6  1.06   0.95    1.01   4.25     4     4    15.0 17.5   1.05    0.94     1.00
##   wagest2 nmgrs2 nregs2 hrsopen2 emp2 compown chain density    crmrte state
## 1    5.05      5      5     15.0 27.0       1     3    4030 0.0528866     1
## 2    5.05      4      3     17.5 24.5       0     1    4030 0.0528866     1
## 3    5.05      4      5     17.5 25.0       0     1   11400 0.0360003     1
## 4    5.05      4      5     16.0   NA       0     3    8345 0.0484232     1
## 5    5.05      3      3     16.0 12.0       0     1     720 0.0615890     1
## 6    5.05      3      4     15.0 28.0       0     1    4424 0.0334823     1
##     prpblck    prppov   prpncar hseval nstores income county     lpsoda
## 1 0.1711542 0.0365789 0.0788428 148300       3  44534     18 0.11332869
## 2 0.1711542 0.0365789 0.0788428 148300       3  44534     18 0.05826885
## 3 0.0473602 0.0879072 0.2694298 169200       3  41164     12 0.05826885
## 4 0.0528394 0.0591227 0.1366903 171600       3  50366     10 0.11332869
## 5 0.0344800 0.0254145 0.0738020 249100       1  72287     10 0.11332869
## 6 0.0591327 0.0835001 0.1151341 148000       2  44515     18 0.05826885
##       lpfries  lhseval  lincome ldensity NJ BK KFC RR
## 1  0.05826885 11.90699 10.70401 8.301521  1  0   0  1
## 2 -0.09431065 11.90699 10.70401 8.301521  1  1   0  0
## 3 -0.09431065 12.03884 10.62532 9.341369  1  1   0  0
## 4  0.01980261 12.05292 10.82707 9.029418  1  0   0  1
## 5          NA 12.42561 11.18840 6.579251  1  1   0  0
## 6 -0.05129331 11.90497 10.70358 8.394799  1  1   0  0
# (i) Calculate the average values and standard deviations for prpblck and income
avg_prpblck <- mean(data$prpblck, na.rm = TRUE)
sd_prpblck <- sd(data$prpblck, na.rm = TRUE)
avg_income <- mean(data$income, na.rm = TRUE)
sd_income <- sd(data$income, na.rm = TRUE)

cat("Average prpblck:", round(avg_prpblck, 3), "\n")
## Average prpblck: 0.113
cat("Standard deviation of prpblck:", round(sd_prpblck, 3), "\n")
## Standard deviation of prpblck: 0.182
cat("Average income:", round(avg_income, 3), "\n")
## Average income: 47053.79
cat("Standard deviation of income:", round(sd_income, 3), "\n")
## Standard deviation of income: 13179.29
# Summary statistics table for the report
summary_stats <- data.frame(
  Variable = c("prpblck", "income"),
  Mean = c(avg_prpblck, avg_income),
  Standard_Deviation = c(sd_prpblck, sd_income)
)
print(summary_stats)
##   Variable         Mean Standard_Deviation
## 1  prpblck 1.134864e-01       1.824165e-01
## 2   income 4.705378e+04       1.317929e+04
# (ii) OLS Model: psoda = β0 + β1*prpblck + β2*income + u
model_1 <- lm(psoda ~ prpblck + income, data = data)
summary(model_1)
## 
## Call:
## lm(formula = psoda ~ prpblck + income, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29401 -0.05242  0.00333  0.04231  0.44322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.563e-01  1.899e-02  50.354  < 2e-16 ***
## prpblck     1.150e-01  2.600e-02   4.423 1.26e-05 ***
## income      1.603e-06  3.618e-07   4.430 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08611 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06422,    Adjusted R-squared:  0.05952 
## F-statistic: 13.66 on 2 and 398 DF,  p-value: 1.835e-06
# Print regression results in a clean table
cat("\nModel 1: Regression Results (psoda ~ prpblck + income)\n")
## 
## Model 1: Regression Results (psoda ~ prpblck + income)
print(coef(summary(model_1)))
##                 Estimate   Std. Error   t value      Pr(>|t|)
## (Intercept) 9.563196e-01 1.899201e-02 50.353788 9.999987e-175
## prpblck     1.149882e-01 2.600064e-02  4.422515  1.260520e-05
## income      1.602674e-06 3.617667e-07  4.430130  1.218837e-05
# Calculate robust standard errors to check for heteroskedasticity
robust_se_model_1 <- coeftest(model_1, vcov = vcovHC(model_1, type = "HC1"))
cat("\nRobust Standard Errors (to check for heteroskedasticity)\n")
## 
## Robust Standard Errors (to check for heteroskedasticity)
print(robust_se_model_1)
## 
## t test of coefficients:
## 
##               Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) 9.5632e-01 1.9097e-02 50.0762 < 2.2e-16 ***
## prpblck     1.1499e-01 2.8790e-02  3.9941 7.736e-05 ***
## income      1.6027e-06 3.6181e-07  4.4296 1.222e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# (iii) Simple regression of psoda on prpblck
model_2 <- lm(psoda ~ prpblck, data = data)
summary(model_2)
## 
## Call:
## lm(formula = psoda ~ prpblck, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30884 -0.05963  0.01135  0.03206  0.44840 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.03740    0.00519  199.87  < 2e-16 ***
## prpblck      0.06493    0.02396    2.71  0.00702 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0881 on 399 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.01808,    Adjusted R-squared:  0.01561 
## F-statistic: 7.345 on 1 and 399 DF,  p-value: 0.007015
# Compare coefficients between the two models
cat("\nComparison of Coefficients:\n")
## 
## Comparison of Coefficients:
cat("Coefficient on prpblck in Model 1 (with income):", round(coef(model_1)["prpblck"], 3), "\n")
## Coefficient on prpblck in Model 1 (with income): 0.115
cat("Coefficient on prpblck in Simple Regression:", round(coef(model_2)["prpblck"], 3), "\n")
## Coefficient on prpblck in Simple Regression: 0.065
# Visualizing the effect of prpblck with and without income
ggplot(data, aes(x = prpblck, y = psoda)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", col = "blue", se = FALSE) +
  ggtitle("Effect of prpblck on psoda (Simple Regression)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).

# (iv) Log-linear model: log(psoda) = β0 + β1*prpblck + β2*log(income) + u
model_3 <- lm(log(psoda) ~ prpblck + log(income), data = data)
summary(model_3)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33563 -0.04695  0.00658  0.04334  0.35413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.79377    0.17943  -4.424 1.25e-05 ***
## prpblck      0.12158    0.02575   4.722 3.24e-06 ***
## log(income)  0.07651    0.01660   4.610 5.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0821 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06809,    Adjusted R-squared:  0.06341 
## F-statistic: 14.54 on 2 and 398 DF,  p-value: 8.039e-07
# Compare R-squared between linear and log-linear models
cat("\nModel Comparison (R-squared):\n")
## 
## Model Comparison (R-squared):
cat("R-squared (Linear Model):", summary(model_1)$r.squared, "\n")
## R-squared (Linear Model): 0.06422039
cat("R-squared (Log-Linear Model):", summary(model_3)$r.squared, "\n")
## R-squared (Log-Linear Model): 0.06809228
# Visualize log-linear model with residuals
ggplot(data, aes(x = log(income), y = log(psoda))) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", col = "red", se = FALSE) +
  ggtitle("Log-Linear Model: log(psoda) ~ prpblck + log(income)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 9 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 9 rows containing missing values or values outside the scale range
## (`geom_point()`).

# (v) Adding prppov to the log-linear model
model_4 <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = data)
summary(model_4)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32218 -0.04648  0.00651  0.04272  0.35622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.46333    0.29371  -4.982  9.4e-07 ***
## prpblck      0.07281    0.03068   2.373   0.0181 *  
## log(income)  0.13696    0.02676   5.119  4.8e-07 ***
## prppov       0.38036    0.13279   2.864   0.0044 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08137 on 397 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.08696,    Adjusted R-squared:  0.08006 
## F-statistic:  12.6 on 3 and 397 DF,  p-value: 6.917e-08
# Compare the effect of prpblck across different models
cat("\nCoefficient on prpblck across models:\n")
## 
## Coefficient on prpblck across models:
cat("Model 1 (Linear, with income):", round(coef(model_1)["prpblck"], 3), "\n")
## Model 1 (Linear, with income): 0.115
cat("Model 3 (Log-Linear):", round(coef(model_3)["prpblck"], 3), "\n")
## Model 3 (Log-Linear): 0.122
cat("Model 4 (Log-Linear, with prppov):", round(coef(model_4)["prpblck"], 3), "\n")
## Model 4 (Log-Linear, with prppov): 0.073
# (vi) Correlation between log(income) and prppov
correlation <- cor(log(data$income), data$prppov, use = "complete.obs")
cat("\nCorrelation between log(income) and prppov:", round(correlation, 3), "\n")
## 
## Correlation between log(income) and prppov: -0.838
# (vii) Evaluating multicollinearity
if (abs(correlation) > 0.8) {
  cat("\nWarning: High correlation between log(income) and prppov (Multicollinearity Risk)\n")
} else {
  cat("\nCorrelation is moderate. Multicollinearity is not a major concern.\n")
}
## 
## Warning: High correlation between log(income) and prppov (Multicollinearity Risk)
# Additional: Diagnostic plots for Model 1
par(mfrow = c(2, 2))  # Show 4 diagnostic plots at once
plot(model_1)

# Additional: Hypothesis testing for the effect of prpblck
cat("\nHypothesis Test for prpblck in Model 1\n")
## 
## Hypothesis Test for prpblck in Model 1
# Null hypothesis: β1 (prpblck coefficient) = 0
# Alternative hypothesis: β1 ≠ 0
hyp_test_prpblck <- coeftest(model_1, vcov = vcovHC(model_1, type = "HC1"))
print(hyp_test_prpblck)
## 
## t test of coefficients:
## 
##               Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) 9.5632e-01 1.9097e-02 50.0762 < 2.2e-16 ***
## prpblck     1.1499e-01 2.8790e-02  3.9941 7.736e-05 ***
## income      1.6027e-06 3.6181e-07  4.4296 1.222e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1