library(dplyr)
library(tidyverse)
library(wooldridge)
data('discrim')

C8: DISCRIM data

(i) Mean, standard deviation, and units

  • prpblck variable
# Mean and standard deviation of prpblck, ignoring missing observations.
sd_prpblck <- with(discrim, sd(prpblck, na.rm = TRUE))
avg_prpblck <- with(discrim, mean(prpblck, na.rm = TRUE))
sprintf("Mean value: %.2f", avg_prpblck)
## [1] "Mean value: 0.11"
sprintf("Standard deviation value: %.2f", sd_prpblck)
## [1] "Standard deviation value: 0.18"
print("The unit of measurement for the variable is proportion.")
## [1] "The unit of measurement for the variable is proportion."
  • income variable
# Mean and standard deviation of income, ignoring missing observations.
# Stored under income-specific names so the prpblck statistics computed
# earlier are not overwritten or mislabelled.
avg_income <- mean(discrim$income, na.rm = TRUE)
sd_income <- sd(discrim$income, na.rm = TRUE)
sprintf("Mean value: %.2f", avg_income)
## [1] "Mean value: 47053.78"
sprintf("Standard deviation value: %.2f", sd_income)
## [1] "Standard deviation value: 13179.29"
print("The unit of measurement for the variable is in USD.")
## [1] "The unit of measurement for the variable is in USD."

(ii) Run regression and interpret the coefficient

# Level-level model: soda price on the proportion of Black residents and income.
ols <- lm(
  formula = psoda ~ prpblck + income,
  data = discrim
)
print(summary(ols))
## 
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29401 -0.05242  0.00333  0.04231  0.44322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.563e-01  1.899e-02  50.354  < 2e-16 ***
## prpblck     1.150e-01  2.600e-02   4.423 1.26e-05 ***
## income      1.603e-06  3.618e-07   4.430 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08611 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06422,    Adjusted R-squared:  0.05952 
## F-statistic: 13.66 on 2 and 398 DF,  p-value: 1.835e-06

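The coefficient on prpblck is 0.115: holding income fixed, a 0.10 (ten-percentage-point) increase in prpblck is associated with an increase of about 0.0115 in psoda. The effect is statistically significant, but its economic magnitude is small.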

(iii) Compare two estimations

# Simple regression of soda price on prpblck only (income omitted).
ols_2 <- lm(
  formula = psoda ~ prpblck,
  data = discrim
)
print(summary(ols_2))
## 
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30884 -0.05963  0.01135  0.03206  0.44840 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.03740    0.00519  199.87  < 2e-16 ***
## prpblck      0.06493    0.02396    2.71  0.00702 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0881 on 399 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.01808,    Adjusted R-squared:  0.01561 
## F-statistic: 7.345 on 1 and 399 DF,  p-value: 0.007015

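The estimated discrimination effect is larger when we control for income: the coefficient on prpblck is 0.115 with income in the model, versus 0.065 in the simple regression.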

(iv) Interpret the result of a regression

# Log-price model with prpblck in levels and income in logs.
ols_3 <- lm(
  formula = log(psoda) ~ prpblck + log(income),
  data = discrim
)
print(summary(ols_3))
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33563 -0.04695  0.00658  0.04334  0.35413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.79377    0.17943  -4.424 1.25e-05 ***
## prpblck      0.12158    0.02575   4.722 3.24e-06 ***
## log(income)  0.07651    0.01660   4.610 5.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0821 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06809,    Adjusted R-squared:  0.06341 
## F-statistic: 14.54 on 2 and 398 DF,  p-value: 8.039e-07
# Approximate percentage change in psoda for a 0.20 increase in prpblck.
# With log(psoda) as the dependent variable, the percent change is
# approximately 100 * coefficient * (change in prpblck).
incr <- coef(ols_3)["prpblck"] * 0.2 * 100
sprintf("If prpblck increases by .20, the estimated percentage change in psoda is %.2f", incr)
## [1] "If prpblck increases by .20, the estimated percentage change in psoda is 2.43"

(v) Add variable

# Same log-price model as ols_3, with prppov added as a regressor.
ols_3_update <- lm(
  formula = log(psoda) ~ prpblck + log(income) + prppov,
  data = discrim
)
print(summary(ols_3_update))
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32218 -0.04648  0.00651  0.04272  0.35622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.46333    0.29371  -4.982  9.4e-07 ***
## prpblck      0.07281    0.03068   2.373   0.0181 *  
## log(income)  0.13696    0.02676   5.119  4.8e-07 ***
## prppov       0.38036    0.13279   2.864   0.0044 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08137 on 397 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.08696,    Adjusted R-squared:  0.08006 
## F-statistic:  12.6 on 3 and 397 DF,  p-value: 6.917e-08

The estimated coefficient on prpblck decreases (from 0.122 to 0.073) and its p-value increases (from 3.24e-06 to 0.0181), although it remains statistically significant at the 5% level.

(vi) Correlation

# Keep only rows where both income and prppov are observed, then correlate
# log(income) with prppov.
subset <- na.omit(select(discrim, income, prppov))
log_income <- log(subset[["income"]])
cor(log_income, subset[["prppov"]])
## [1] -0.838467

The correlation is roughly what I expected. The value of -0.838467 indicates a strong negative relationship: the higher the proportion of the population in poverty (prppov), the lower log(income).

(vii) Evaluate statement

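I would not agree with the statement. The assumptions of multiple linear regression rule out only perfect collinearity, so as long as the correlation between two independent variables is not exactly 1 in absolute value, we do not have to remove either of them from the regression. In part (v), the coefficients on both log(income) and prppov are individually statistically significant despite their high correlation.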