library(dplyr)
library(tidyverse)
library(wooldridge)
# Load the `discrim` data set into the workspace for the analysis below.
data("discrim")
# DISCRIM data ----
# prpblck variable: sample mean and standard deviation, ignoring missing
# values (na.rm = TRUE).
avg_prpblck <- mean(discrim$prpblck, na.rm = TRUE)
sd_prpblck <- sd(discrim$prpblck, na.rm = TRUE)
sprintf("Mean value: %.2f", avg_prpblck)
## [1] "Mean value: 0.11"
sprintf("Standard deviation value: %.2f", sd_prpblck)
## [1] "Standard deviation value: 0.18"
print("The unit of measurement for the variable is proportion.")
## [1] "The unit of measurement for the variable is proportion."
# income variable: sample mean and standard deviation, ignoring missing
# values (na.rm = TRUE). Income-specific names keep these results distinct
# from the prpblck statistics computed earlier.
avg_income <- mean(discrim$income, na.rm = TRUE)
sd_income <- sd(discrim$income, na.rm = TRUE)
sprintf("Mean value: %.2f", avg_income)
## [1] "Mean value: 47053.78"
sprintf("Standard deviation value: %.2f", sd_income)
## [1] "Standard deviation value: 13179.29"
print("The unit of measurement for the variable is in USD.")
## [1] "The unit of measurement for the variable is in USD."
# Model 1: OLS regression of psoda on prpblck and income, all in levels.
# Rows with a missing value in any model variable are dropped by lm()
# (the summary output reports 9 such observations).
ols <- lm(psoda ~ prpblck + income, data = discrim)
# Print coefficient estimates, standard errors, t tests, and fit statistics.
summary(ols)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
The coefficient on prpblck is 0.115 (1.150e-01). While it is statistically significant, its economic effect is small: a 0.10 increase in prpblck changes the predicted value of psoda by only about 0.0115.
# Model 2: simple regression of psoda on prpblck only (income omitted), for
# comparison with the prpblck coefficient from the model that includes income.
ols_2 <- lm(psoda ~ prpblck, data = discrim)
# Print coefficient estimates, standard errors, t tests, and fit statistics.
summary(ols_2)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
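The estimated discrimination effect is larger when we control for income: the coefficient on prpblck is 0.115 in the multiple regression versus 0.065 in the simple regression.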
# Model 3: log(psoda) regressed on prpblck (in levels) and log(income).
# With a logged dependent variable, 100 * (coefficient on prpblck) * (change
# in prpblck) approximates the percentage change in psoda; the coefficient on
# log(income) is an elasticity.
ols_3 <- lm(log(psoda) ~ prpblck + log(income), data = discrim)
# Print coefficient estimates, standard errors, t tests, and fit statistics.
summary(ols_3)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33563 -0.04695 0.00658 0.04334 0.35413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.79377 0.17943 -4.424 1.25e-05 ***
## prpblck 0.12158 0.02575 4.722 3.24e-06 ***
## log(income) 0.07651 0.01660 4.610 5.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0821 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06809, Adjusted R-squared: 0.06341
## F-statistic: 14.54 on 2 and 398 DF, p-value: 8.039e-07
# Approximate percentage change in psoda when prpblck rises by 0.20:
# in a log-level model, 100 * beta * (change in x) approximates the % change
# in y, using the prpblck coefficient from ols_3.
incr <- coef(ols_3)["prpblck"] * 0.2 * 100
sprintf(
  "If prpblck increases by .20, the estimated percentage change in psoda is %.2f",
  incr
)
## [1] "If prpblck increases by .20, the estimated percentage change in psoda is 2.43"
# Model 4: the log(psoda) model on prpblck and log(income), extended with
# prppov as an additional regressor.
ols_3_update <- lm(log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
# Print coefficient estimates, standard errors, t tests, and fit statistics.
summary(ols_3_update)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08137 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08696, Adjusted R-squared: 0.08006
## F-statistic: 12.6 on 3 and 397 DF, p-value: 6.917e-08
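After adding prppov, the estimated coefficient on prpblck decreases (from 0.122 to 0.073) and its p-value increases (from 3.24e-06 to 0.0181), although the coefficient remains statistically significant at the 5% level.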
# Correlation between log(income) and prppov.
# Keep only rows where both variables are observed so the two vectors passed
# to cor() are aligned and contain no NA. The intermediate table gets a
# descriptive name rather than `subset`, which is also the name of a base R
# function.
income_poverty <- discrim %>%
  select(income, prppov) %>%
  na.omit()
log_income <- log(income_poverty$income)
cor(log_income, income_poverty$prppov)
## [1] -0.838467
The correlation is roughly what I expected. The value of -0.838467 indicates a strong negative relationship: the higher the proportion of the population in poverty, the lower the (log) family income.
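I would not agree with the statement. According to the assumptions of multiple linear regression, as long as two independent variables are not perfectly correlated (a correlation of exactly 1 or -1), we do not have to remove either of them from the regression. Here, despite the high correlation, both log(income) and prppov are individually statistically significant.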