# Setup ----
# Install wooldridge only when it is not already available, so re-running
# the script does not reinstall the package (and need network access)
# every time.
if (!requireNamespace("wooldridge", quietly = TRUE)) {
  install.packages("wooldridge")
}
library(wooldridge)

# Load the discrim data set shipped with wooldridge into the workspace.
data("discrim", package = "wooldridge")

#(i) Find the average values and standard deviations of prpblck and income.

# Summary statistics for prpblck and income. Missing values are dropped
# before each statistic is computed.
mean_prpblck <- mean(discrim[["prpblck"]], na.rm = TRUE)
mean_income <- mean(discrim[["income"]], na.rm = TRUE)

sd_prpblck <- sd(discrim[["prpblck"]], na.rm = TRUE)
sd_income <- sd(discrim[["income"]], na.rm = TRUE)

# Report each variable's mean next to its standard deviation.
cat("Average prpblck: ", mean_prpblck, ", SD: ", sd_prpblck, "\n")
## Average prpblck:  0.1134864 , SD:  0.1824165
cat("Average income: ", mean_income, ", SD: ", sd_income, "\n")
## Average income:  47053.78 , SD:  13179.29

##(ii) Estimate the OLS model psoda = β₀ + β₁prpblck + β₂income + u.

# Fit psoda on prpblck and income by OLS and print the full summary
# (rows with missing values are dropped by lm's default NA handling).
model_ols <- lm(
  formula = psoda ~ prpblck + income,
  data = discrim
)
summary(model_ols)
## 
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29401 -0.05242  0.00333  0.04231  0.44322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.563e-01  1.899e-02  50.354  < 2e-16 ***
## prpblck     1.150e-01  2.600e-02   4.423 1.26e-05 ***
## income      1.603e-06  3.618e-07   4.430 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08611 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06422,    Adjusted R-squared:  0.05952 
## F-statistic: 13.66 on 2 and 398 DF,  p-value: 1.835e-06
# Extract the prpblck slope from the multiple regression and report it.
# Reading: holding income fixed, a 0.10 increase in prpblck goes with an
# increase of roughly 0.0115 in psoda.
coef_prpblck <- coefficients(model_ols)["prpblck"]
cat("Coefficient on prpblck: ", coef_prpblck, "\n")
## Coefficient on prpblck:  0.1149882

##(iii) Compare the estimate from part (ii) with the simple regression psoda ~ prpblck.

# Fit the one-regressor model (psoda on prpblck only) and print its summary,
# so the prpblck slope can be compared with the one from the model that
# also controls for income.
model_simple <- lm(
  formula = psoda ~ prpblck,
  data = discrim
)
summary(model_simple)
## 
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30884 -0.05963  0.01135  0.03206  0.44840 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.03740    0.00519  199.87  < 2e-16 ***
## prpblck      0.06493    0.02396    2.71  0.00702 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0881 on 399 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.01808,    Adjusted R-squared:  0.01561 
## F-statistic: 7.345 on 1 and 399 DF,  p-value: 0.007015
# Put the two prpblck slopes side by side: the estimate is larger once
# income is controlled for (about 0.115) than in the simple regression
# (about 0.065).
prpblck_slope_simple <- coef(model_simple)["prpblck"]
prpblck_slope_multiple <- coef(model_ols)["prpblck"]

cat("Coefficient from simple regression: ", prpblck_slope_simple, "\n")
## Coefficient from simple regression:  0.06492687
cat("Coefficient from multiple regression: ", prpblck_slope_multiple, "\n")
## Coefficient from multiple regression:  0.1149882

##(iv) Estimate the log-linear model log(psoda) = β₀ + β₁prpblck + β₂log(income) + u.

# Fit log(psoda) on prpblck and log(income) and print the summary. With
# both psoda and income in logs, the log(income) slope is an elasticity.
model_log <- lm(
  formula = log(psoda) ~ prpblck + log(income),
  data = discrim
)
summary(model_log)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33563 -0.04695  0.00658  0.04334  0.35413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.79377    0.17943  -4.424 1.25e-05 ***
## prpblck      0.12158    0.02575   4.722 3.24e-06 ***
## log(income)  0.07651    0.01660   4.610 5.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0821 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06809,    Adjusted R-squared:  0.06341 
## F-statistic: 14.54 on 2 and 398 DF,  p-value: 8.039e-07
# prpblck slope from the log(psoda) model
beta_prpblck_log <- coef(model_log)["prpblck"]

# Approximate percentage change in psoda for a 0.20 increase in prpblck,
# using %change ~= 100 * beta * (change in prpblck).
prpblck_increase <- 0.20
percentage_change_psoda <- 100 * (prpblck_increase * beta_prpblck_log)
cat("Estimated percentage change in psoda: ", percentage_change_psoda, "%\n")
## Estimated percentage change in psoda:  2.431605 %

##(v) Add the variable prppov to the regression model.

# Refit the log(psoda) model with prppov added as a third regressor and
# print the summary.
model_log_prppov <- lm(
  formula = log(psoda) ~ prpblck + log(income) + prppov,
  data = discrim
)
summary(model_log_prppov)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32218 -0.04648  0.00651  0.04272  0.35622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.46333    0.29371  -4.982  9.4e-07 ***
## prpblck      0.07281    0.03068   2.373   0.0181 *  
## log(income)  0.13696    0.02676   5.119  4.8e-07 ***
## prppov       0.38036    0.13279   2.864   0.0044 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08137 on 397 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.08696,    Adjusted R-squared:  0.08006 
## F-statistic:  12.6 on 3 and 397 DF,  p-value: 6.917e-08
# Report the prpblck slope after prppov enters the model; it is smaller
# than in the model without prppov.
prpblck_slope_with_prppov <- coef(model_log_prppov)["prpblck"]
cat("New coefficient of prpblck: ", prpblck_slope_with_prppov, "\n")
## New coefficient of prpblck:  0.07280726

##(vi) Find the correlation between log(income) and prppov.

# Sample correlation of log(income) with prppov, computed over the rows
# where both values are present.
correlation_log_income_prppov <- with(
  discrim,
  cor(log(income), prppov, use = "complete.obs")
)
cat(
  "Correlation between log(income) and prppov: ",
  correlation_log_income_prppov,
  "\n"
)
## Correlation between log(income) and prppov:  -0.838467

##(vii) Evaluate the statement about multicollinearity.

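# The correlation between log(income) and prppov is about -0.84, which is
# strong, so the two regressors are highly collinear. Collinearity inflates
# standard errors: the standard error on log(income) rises from 0.0166 in (iv)
# to 0.0268 in (v). Even so, both log(income) and prppov are individually
# significant in (v) (t = 5.12 and t = 2.86), so each is still estimated
# precisely enough to be useful. Dropping either one would also change the
# prpblck estimate (it fell from 0.122 to 0.073 once prppov was added), which
# suggests omitted-variable bias. A high correlation alone is therefore not a
# reason to exclude one of them; the statement has little merit here.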