# Install and load necessary packages
install.packages("wooldridge")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages("car") # Required for partial regression plots
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
## also installing the dependency 'lme4'
## Warning in install.packages("car"): installation of package 'lme4' had non-zero
## exit status
library(wooldridge)
library(car)
## Loading required package: carData
# Load the DISCRIM dataset from the wooldridge package
data("discrim")
DISCRIM <- discrim
# Check the structure of the dataset to ensure columns are correct
str(DISCRIM)
## 'data.frame': 410 obs. of 37 variables:
## $ psoda : num 1.12 1.06 1.06 1.12 1.12 ...
## $ pfries : num 1.06 0.91 0.91 1.02 NA ...
## $ pentree : num 1.02 0.95 0.98 1.06 0.49 ...
## $ wagest : num 4.25 4.75 4.25 5 5 ...
## $ nmgrs : num 3 3 3 4 3 4 3 3 4 3 ...
## $ nregs : int 5 3 5 5 3 4 2 5 4 5 ...
## $ hrsopen : num 16 16.5 18 16 16 15 16 17 17 18 ...
## $ emp : num 27.5 21.5 30 27.5 5 17.5 22.5 18.5 17 27 ...
## $ psoda2 : num 1.11 1.05 1.05 1.15 1.04 ...
## $ pfries2 : num 1.11 0.89 0.94 1.05 1.01 ...
## $ pentree2: num 1.05 0.95 0.98 1.05 0.58 ...
## $ wagest2 : num 5.05 5.05 5.05 5.05 5.05 ...
## $ nmgrs2 : num 5 4 4 4 3 3 3 3 4 6 ...
## $ nregs2 : int 5 3 5 5 3 4 2 5 4 5 ...
## $ hrsopen2: num 15 17.5 17.5 16 16 15 16 16 18 17 ...
## $ emp2 : num 27 24.5 25 NA 12 28 18.5 17 34 22 ...
## $ compown : int 1 0 0 0 0 0 0 1 0 1 ...
## $ chain : int 3 1 1 3 1 1 1 3 1 3 ...
## $ density : num 4030 4030 11400 8345 720 ...
## $ crmrte : num 0.0529 0.0529 0.036 0.0484 0.0616 ...
## $ state : int 1 1 1 1 1 1 1 1 1 1 ...
## $ prpblck : num 0.1712 0.1712 0.0474 0.0528 0.0345 ...
## $ prppov : num 0.0366 0.0366 0.0879 0.0591 0.0254 ...
## $ prpncar : num 0.0788 0.0788 0.2694 0.1367 0.0738 ...
## $ hseval : num 148300 148300 169200 171600 249100 ...
## $ nstores : int 3 3 3 3 1 2 1 1 5 5 ...
## $ income : num 44534 44534 41164 50366 72287 ...
## $ county : int 18 18 12 10 10 18 10 24 10 10 ...
## $ lpsoda : num 0.1133 0.0583 0.0583 0.1133 0.1133 ...
## $ lpfries : num 0.0583 -0.0943 -0.0943 0.0198 NA ...
## $ lhseval : num 11.9 11.9 12 12.1 12.4 ...
## $ lincome : num 10.7 10.7 10.6 10.8 11.2 ...
## $ ldensity: num 8.3 8.3 9.34 9.03 6.58 ...
## $ NJ : int 1 1 1 1 1 1 1 1 1 1 ...
## $ BK : int 0 1 1 0 1 1 1 0 1 0 ...
## $ KFC : int 0 0 0 0 0 0 0 0 0 0 ...
## $ RR : int 1 0 0 1 0 0 0 1 0 1 ...
## - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
# 1. Calculate mean and standard deviation of prpblck and income
mean_prpblck <- mean(DISCRIM$prpblck, na.rm = TRUE)
sd_prpblck <- sd(DISCRIM$prpblck, na.rm = TRUE)
mean_income <- mean(DISCRIM$income, na.rm = TRUE)
sd_income <- sd(DISCRIM$income, na.rm = TRUE)
cat("Mean of prpblck:", mean_prpblck, "\n")
## Mean of prpblck: 0.1134864
cat("Standard deviation of prpblck:", sd_prpblck, "\n")
## Standard deviation of prpblck: 0.1824165
cat("Mean of income:", mean_income, "\n")
## Mean of income: 47053.78
cat("Standard deviation of income:", sd_income, "\n")
## Standard deviation of income: 13179.29
# 2. OLS regression model for psoda
ols_model <- lm(psoda ~ prpblck + income, data = DISCRIM)
# Display summary of the OLS model
cat("\nOLS Regression Model Summary:\n")
##
## OLS Regression Model Summary:
summary(ols_model)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = DISCRIM)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
# 3. Log-linear regression model for psoda
log_model <- lm(log(psoda) ~ prpblck + log(income), data = DISCRIM)
# Display summary of the log model
cat("\nLog-Linear Model Summary:\n")
##
## Log-Linear Model Summary:
summary(log_model)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = DISCRIM)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33563 -0.04695 0.00658 0.04334 0.35413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.79377 0.17943 -4.424 1.25e-05 ***
## prpblck 0.12158 0.02575 4.722 3.24e-06 ***
## log(income) 0.07651 0.01660 4.610 5.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0821 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06809, Adjusted R-squared: 0.06341
## F-statistic: 14.54 on 2 and 398 DF, p-value: 8.039e-07
# 4. Coefficient for prpblck from the log model
beta_prpblck <- coef(log_model)["prpblck"]
# Estimated percentage change in psoda for 20 percentage point increase in prpblck
percentage_change <- beta_prpblck * 0.20
cat("\nEstimated percentage change in psoda for a 20% increase in prpblck:", percentage_change, "\n")
##
## Estimated percentage change in psoda for a 20% increase in prpblck: 0.02431605
# 5. OLS model with prppov (adding proportion of population below poverty line)
extended_model <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = DISCRIM)
# Display summary of the extended model
cat("\nExtended Model with prppov Summary:\n")
##
## Extended Model with prppov Summary:
summary(extended_model)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = DISCRIM)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08137 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08696, Adjusted R-squared: 0.08006
## F-statistic: 12.6 on 3 and 397 DF, p-value: 6.917e-08
# 6. Compute correlation between log(income) and prppov
correlation <- cor(log(DISCRIM$income), DISCRIM$prppov, use = "complete.obs")
cat("\nCorrelation between log(income) and prppov:", correlation, "\n")
##
## Correlation between log(income) and prppov: -0.838467
# -----------------------------------------------------------------------
# Option 1: Simple Regression Plot (prpblck only)
# -----------------------------------------------------------------------
# Simple regression model with prpblck only
simple_model <- lm(psoda ~ prpblck, data = DISCRIM)
# Plot the relationship between prpblck and psoda
plot(DISCRIM$prpblck, DISCRIM$psoda, main = "Relationship between prpblck and psoda",
xlab = "Proportion of Black Population (prpblck)", ylab = "Price of Soda (psoda)")
# Add regression line from the simple model
abline(simple_model, col = "red")

# -----------------------------------------------------------------------
# Option 2: Partial Regression Plots (for prpblck and income)
# -----------------------------------------------------------------------
# Partial regression plots for the multiple regression model
avPlots(ols_model)
