# Install and load necessary packages
install.packages("wooldridge")
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
install.packages("car")  # Required for partial regression plots
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
## also installing the dependency 'lme4'
## Warning in install.packages("car"): installation of package 'lme4' had non-zero
## exit status
library(wooldridge)
library(car)
## Loading required package: carData
# Load the DISCRIM dataset from the wooldridge package
data("discrim")
DISCRIM <- discrim

# Check the structure of the dataset to ensure columns are correct
str(DISCRIM)
## 'data.frame':    410 obs. of  37 variables:
##  $ psoda   : num  1.12 1.06 1.06 1.12 1.12 ...
##  $ pfries  : num  1.06 0.91 0.91 1.02 NA ...
##  $ pentree : num  1.02 0.95 0.98 1.06 0.49 ...
##  $ wagest  : num  4.25 4.75 4.25 5 5 ...
##  $ nmgrs   : num  3 3 3 4 3 4 3 3 4 3 ...
##  $ nregs   : int  5 3 5 5 3 4 2 5 4 5 ...
##  $ hrsopen : num  16 16.5 18 16 16 15 16 17 17 18 ...
##  $ emp     : num  27.5 21.5 30 27.5 5 17.5 22.5 18.5 17 27 ...
##  $ psoda2  : num  1.11 1.05 1.05 1.15 1.04 ...
##  $ pfries2 : num  1.11 0.89 0.94 1.05 1.01 ...
##  $ pentree2: num  1.05 0.95 0.98 1.05 0.58 ...
##  $ wagest2 : num  5.05 5.05 5.05 5.05 5.05 ...
##  $ nmgrs2  : num  5 4 4 4 3 3 3 3 4 6 ...
##  $ nregs2  : int  5 3 5 5 3 4 2 5 4 5 ...
##  $ hrsopen2: num  15 17.5 17.5 16 16 15 16 16 18 17 ...
##  $ emp2    : num  27 24.5 25 NA 12 28 18.5 17 34 22 ...
##  $ compown : int  1 0 0 0 0 0 0 1 0 1 ...
##  $ chain   : int  3 1 1 3 1 1 1 3 1 3 ...
##  $ density : num  4030 4030 11400 8345 720 ...
##  $ crmrte  : num  0.0529 0.0529 0.036 0.0484 0.0616 ...
##  $ state   : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ prpblck : num  0.1712 0.1712 0.0474 0.0528 0.0345 ...
##  $ prppov  : num  0.0366 0.0366 0.0879 0.0591 0.0254 ...
##  $ prpncar : num  0.0788 0.0788 0.2694 0.1367 0.0738 ...
##  $ hseval  : num  148300 148300 169200 171600 249100 ...
##  $ nstores : int  3 3 3 3 1 2 1 1 5 5 ...
##  $ income  : num  44534 44534 41164 50366 72287 ...
##  $ county  : int  18 18 12 10 10 18 10 24 10 10 ...
##  $ lpsoda  : num  0.1133 0.0583 0.0583 0.1133 0.1133 ...
##  $ lpfries : num  0.0583 -0.0943 -0.0943 0.0198 NA ...
##  $ lhseval : num  11.9 11.9 12 12.1 12.4 ...
##  $ lincome : num  10.7 10.7 10.6 10.8 11.2 ...
##  $ ldensity: num  8.3 8.3 9.34 9.03 6.58 ...
##  $ NJ      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ BK      : int  0 1 1 0 1 1 1 0 1 0 ...
##  $ KFC     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ RR      : int  1 0 0 1 0 0 0 1 0 1 ...
##  - attr(*, "time.stamp")= chr "25 Jun 2011 23:03"
# 1. Calculate mean and standard deviation of prpblck and income
mean_prpblck <- mean(DISCRIM$prpblck, na.rm = TRUE)
sd_prpblck <- sd(DISCRIM$prpblck, na.rm = TRUE)

mean_income <- mean(DISCRIM$income, na.rm = TRUE)
sd_income <- sd(DISCRIM$income, na.rm = TRUE)

cat("Mean of prpblck:", mean_prpblck, "\n")
## Mean of prpblck: 0.1134864
cat("Standard deviation of prpblck:", sd_prpblck, "\n")
## Standard deviation of prpblck: 0.1824165
cat("Mean of income:", mean_income, "\n")
## Mean of income: 47053.78
cat("Standard deviation of income:", sd_income, "\n")
## Standard deviation of income: 13179.29
# 2. OLS regression model for psoda
ols_model <- lm(psoda ~ prpblck + income, data = DISCRIM)

# Display summary of the OLS model
cat("\nOLS Regression Model Summary:\n")
## 
## OLS Regression Model Summary:
summary(ols_model)
## 
## Call:
## lm(formula = psoda ~ prpblck + income, data = DISCRIM)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29401 -0.05242  0.00333  0.04231  0.44322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.563e-01  1.899e-02  50.354  < 2e-16 ***
## prpblck     1.150e-01  2.600e-02   4.423 1.26e-05 ***
## income      1.603e-06  3.618e-07   4.430 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08611 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06422,    Adjusted R-squared:  0.05952 
## F-statistic: 13.66 on 2 and 398 DF,  p-value: 1.835e-06
# 3. Log-linear regression model for psoda
log_model <- lm(log(psoda) ~ prpblck + log(income), data = DISCRIM)

# Display summary of the log model
cat("\nLog-Linear Model Summary:\n")
## 
## Log-Linear Model Summary:
summary(log_model)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = DISCRIM)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33563 -0.04695  0.00658  0.04334  0.35413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.79377    0.17943  -4.424 1.25e-05 ***
## prpblck      0.12158    0.02575   4.722 3.24e-06 ***
## log(income)  0.07651    0.01660   4.610 5.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0821 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06809,    Adjusted R-squared:  0.06341 
## F-statistic: 14.54 on 2 and 398 DF,  p-value: 8.039e-07
# 4. Coefficient for prpblck from the log model
beta_prpblck <- coef(log_model)["prpblck"]

# Estimated percentage change in psoda for 20 percentage point increase in prpblck
percentage_change <- beta_prpblck * 0.20
cat("\nEstimated percentage change in psoda for a 20% increase in prpblck:", percentage_change, "\n")
## 
## Estimated percentage change in psoda for a 20% increase in prpblck: 0.02431605
# 5. OLS model with prppov (adding proportion of population below poverty line)
extended_model <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = DISCRIM)

# Display summary of the extended model
cat("\nExtended Model with prppov Summary:\n")
## 
## Extended Model with prppov Summary:
summary(extended_model)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = DISCRIM)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32218 -0.04648  0.00651  0.04272  0.35622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.46333    0.29371  -4.982  9.4e-07 ***
## prpblck      0.07281    0.03068   2.373   0.0181 *  
## log(income)  0.13696    0.02676   5.119  4.8e-07 ***
## prppov       0.38036    0.13279   2.864   0.0044 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08137 on 397 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.08696,    Adjusted R-squared:  0.08006 
## F-statistic:  12.6 on 3 and 397 DF,  p-value: 6.917e-08
# 6. Compute correlation between log(income) and prppov
correlation <- cor(log(DISCRIM$income), DISCRIM$prppov, use = "complete.obs")
cat("\nCorrelation between log(income) and prppov:", correlation, "\n")
## 
## Correlation between log(income) and prppov: -0.838467
# -----------------------------------------------------------------------
# Option 1: Simple Regression Plot (prpblck only)
# -----------------------------------------------------------------------
# Simple regression model with prpblck only
simple_model <- lm(psoda ~ prpblck, data = DISCRIM)

# Plot the relationship between prpblck and psoda
plot(DISCRIM$prpblck, DISCRIM$psoda, main = "Relationship between prpblck and psoda",
     xlab = "Proportion of Black Population (prpblck)", ylab = "Price of Soda (psoda)")

# Add regression line from the simple model
abline(simple_model, col = "red")

# -----------------------------------------------------------------------
# Option 2: Partial Regression Plots (for prpblck and income)
# -----------------------------------------------------------------------
# Partial regression plots for the multiple regression model
avPlots(ols_model)