# Setup: make sure the 'wooldridge' data package is available, attach it, and
# load the 'discrim' dataset that the rest of the script works on.
# The install is guarded so it only happens when the package is missing; an
# unconditional install.packages() re-downloads on every run and stops the
# script when there is no network access or no CRAN mirror configured.
if (!requireNamespace("wooldridge", quietly = TRUE)) {
  install.packages("wooldridge")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(wooldridge)
# Name the package explicitly so the dataset cannot be picked up from a
# different attached package that happens to ship a 'discrim' object.
data("discrim", package = "wooldridge")
# (i) Find the average values and standard deviations of prpblck and income.
# Every statistic is computed with na.rm = TRUE so missing entries are skipped.
mean_prpblck <- with(discrim, mean(prpblck, na.rm = TRUE))
mean_income <- with(discrim, mean(income, na.rm = TRUE))
sd_prpblck <- with(discrim, sd(prpblck, na.rm = TRUE))
sd_income <- with(discrim, sd(income, na.rm = TRUE))
# Print the mean and standard deviation together, one line per variable.
cat("Average prpblck: ", mean_prpblck, ", SD: ", sd_prpblck, "\n")
## Average prpblck: 0.1134864 , SD: 0.1824165
cat("Average income: ", mean_income, ", SD: ", sd_income, "\n")
## Average income: 47053.78 , SD: 13179.29
# (ii) Estimate the OLS model psoda = β₀ + β₁prpblck + β₂income + u.
# Level-level regression of soda price on the two regressors; the summary
# printed next shows estimates, standard errors, and fit statistics.
model_ols <- lm(
  formula = psoda ~ prpblck + income,
  data = discrim
)
summary(model_ols)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
# Extract and report the prpblck slope from the multiple regression.
# Reading the estimate: with income held fixed, a 0.10 increase in prpblck
# goes with psoda being about 0.0115 higher (in psoda's own units).
coef_prpblck <- coefficients(model_ols)["prpblck"]
cat("Coefficient on prpblck: ", coef_prpblck, "\n")
## Coefficient on prpblck: 0.1149882
# (iii) Compare the estimate from part (ii) with the simple regression
# psoda ~ prpblck.
# Same dependent variable as part (ii), but income is left out of the model.
model_simple <- lm(
  formula = psoda ~ prpblck,
  data = discrim
)
summary(model_simple)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
# Print the prpblck slope from each model so the two can be read side by side.
# The simple-regression estimate is the smaller of the two, i.e. the slope
# rises once income is controlled for.
cat(
  "Coefficient from simple regression: ",
  coefficients(model_simple)["prpblck"],
  "\n"
)
## Coefficient from simple regression: 0.06492687
cat(
  "Coefficient from multiple regression: ",
  coefficients(model_ols)["prpblck"],
  "\n"
)
## Coefficient from multiple regression: 0.1149882
# (iv) Estimate the log-linear model
# log(psoda) = β₀ + β₁prpblck + β₂log(income) + u.
# Both psoda and income enter in logs; prpblck stays in levels.
model_log <- lm(
  formula = log(psoda) ~ prpblck + log(income),
  data = discrim
)
summary(model_log)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33563 -0.04695 0.00658 0.04334 0.35413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.79377 0.17943 -4.424 1.25e-05 ***
## prpblck 0.12158 0.02575 4.722 3.24e-06 ***
## log(income) 0.07651 0.01660 4.610 5.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0821 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06809, Adjusted R-squared: 0.06341
## F-statistic: 14.54 on 2 and 398 DF, p-value: 8.039e-07
# Slope on prpblck in the log(psoda) model.
beta_prpblck_log <- coefficients(model_log)["prpblck"]
# Approximate percentage change in psoda when prpblck rises by 0.20:
# 100 * (0.20 * beta). This is the usual log approximation; the exact figure
# would be 100 * (exp(0.20 * beta) - 1), which is slightly larger.
percentage_change_psoda <- 100 * (0.20 * beta_prpblck_log)
cat("Estimated percentage change in psoda: ", percentage_change_psoda, "%\n")
## Estimated percentage change in psoda: 2.431605 %
# (v) Add the variable prppov to the regression model.
# Same specification as part (iv) with prppov as an additional regressor.
model_log_prppov <- lm(
  formula = log(psoda) ~ prpblck + log(income) + prppov,
  data = discrim
)
summary(model_log_prppov)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08137 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08696, Adjusted R-squared: 0.08006
## F-statistic: 12.6 on 3 and 397 DF, p-value: 6.917e-08
# Report the prpblck slope after prppov has been added to the model; it is
# smaller than the part (iv) estimate shown in that model's summary.
cat(
  "New coefficient of prpblck: ",
  coefficients(model_log_prppov)["prpblck"],
  "\n"
)
## New coefficient of prpblck: 0.07280726
# (vi) Find the correlation between log(income) and prppov.
# use = "complete.obs" restricts the calculation to rows where both
# variables are observed.
correlation_log_income_prppov <- with(
  discrim,
  cor(log(income), prppov, use = "complete.obs")
)
cat("Correlation between log(income) and prppov: ", correlation_log_income_prppov, "\n")
## Correlation between log(income) and prppov: -0.838467
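# (vii) Evaluate the statement about multicollinearity.
# The correlation between log(income) and prppov is about -0.84: high, but
# well short of perfect (+1 or -1). A strong correlation between regressors
# can inflate their standard errors and make the individual estimates less
# precise, so the concern behind the statement is understandable. Here,
# however, both log(income) and prppov are individually significant in the
# part (v) regression (p-values of about 4.8e-07 and 0.0044), and adding
# prppov moved the prpblck coefficient from about 0.122 to about 0.073.
# Each variable therefore carries separate information, and the results
# support keeping both in the model rather than dropping one of them.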