#Clearing environment
rm(list = ls())
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(psych)
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

I. What is bias of an estimator?

The bias of an estimator is the discrepancy between its expected value and the true value of the parameter being estimated. In layman’s words, it quantifies how far an estimator deviates from the true value of the parameter it is supposed to estimate.

II. Impact

Omitted variable bias will not be eliminated merely by expanding the sample size or adding more variables. While a greater sample size can result in more exact estimates, it does not address the bias produced by removing an important variable. If we add more factors, including those that account for the influence of the omitted variable, the bias may reduce. However, adding irrelevant variables may result in overfitting without resolving the initial bias. The most effective strategy to address omitted variable bias is to identify and include all relevant variables in your model.

III. Trying OVB

1. Model

df <- mtcars
head(df)
##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1
describe(df)
##      vars  n   mean     sd median trimmed    mad   min    max  range  skew
## mpg     1 32  20.09   6.03  19.20   19.70   5.41 10.40  33.90  23.50  0.61
## cyl     2 32   6.19   1.79   6.00    6.23   2.97  4.00   8.00   4.00 -0.17
## disp    3 32 230.72 123.94 196.30  222.52 140.48 71.10 472.00 400.90  0.38
## hp      4 32 146.69  68.56 123.00  141.19  77.10 52.00 335.00 283.00  0.73
## drat    5 32   3.60   0.53   3.70    3.58   0.70  2.76   4.93   2.17  0.27
## wt      6 32   3.22   0.98   3.33    3.15   0.77  1.51   5.42   3.91  0.42
## qsec    7 32  17.85   1.79  17.71   17.83   1.42 14.50  22.90   8.40  0.37
## vs      8 32   0.44   0.50   0.00    0.42   0.00  0.00   1.00   1.00  0.24
## am      9 32   0.41   0.50   0.00    0.38   0.00  0.00   1.00   1.00  0.36
## gear   10 32   3.69   0.74   4.00    3.62   1.48  3.00   5.00   2.00  0.53
## carb   11 32   2.81   1.62   2.00    2.65   1.48  1.00   8.00   7.00  1.05
##      kurtosis    se
## mpg     -0.37  1.07
## cyl     -1.76  0.32
## disp    -1.21 21.91
## hp      -0.14 12.12
## drat    -0.71  0.09
## wt      -0.02  0.17
## qsec     0.34  0.32
## vs      -2.00  0.09
## am      -1.92  0.09
## gear    -1.07  0.13
## carb     1.26  0.29

The equation is:

\[ \text{mpg} = \beta_0 + \beta_1 \cdot \text{cyl} + \beta_2 \cdot \text{disp} + \beta_3 \cdot \text{hp} \]

# Fit the linear regression model
model <- lm(mpg ~ cyl + disp + hp, data = df)

# Display the summary of the model
summary(model)
## 
## Call:
## lm(formula = mpg ~ cyl + disp + hp, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0889 -2.0845 -0.7745  1.3972  6.9183 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.18492    2.59078  13.195 1.54e-13 ***
## cyl         -1.22742    0.79728  -1.540   0.1349    
## disp        -0.01884    0.01040  -1.811   0.0809 .  
## hp          -0.01468    0.01465  -1.002   0.3250    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.055 on 28 degrees of freedom
## Multiple R-squared:  0.7679, Adjusted R-squared:  0.743 
## F-statistic: 30.88 on 3 and 28 DF,  p-value: 5.054e-09

The estimated intercept is 34.18492, meaning that when all independent variables are zero, the predicted mpg would be approximately 34.18. The model suggests a negative relationship between cyl, disp, and hp with mpg. However, only disp is approaching statistical significance, indicating it may have a slight impact on fuel efficiency.

We will be studying the disp as our independent variable.

2. Dropped var by mistake

The modified equation will be:

\[ \text{mpg} = \beta_0' + \beta_1' \cdot \text{disp} + \beta_2' \cdot \text{hp} \]

3. OVB Conditions

cor_test_y <- cor.test(df$cyl, df$mpg)

# Output the results
cor_test_y
## 
##  Pearson's product-moment correlation
## 
## data:  df$cyl and df$mpg
## t = -8.9197, df = 30, p-value = 6.113e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.9257694 -0.7163171
## sample estimates:
##       cor 
## -0.852162
cor_test_x <- cor.test(df$cyl, df$disp)

# Output the results
cor_test_x
## 
##  Pearson's product-moment correlation
## 
## data:  df$cyl and df$disp
## t = 11.445, df = 30, p-value = 1.803e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8072442 0.9514607
## sample estimates:
##       cor 
## 0.9020329

Condition 1: The omitted variable is correlated with the included independent variable. Here, since cyl is significantly correlated with mpg, the first condition for OVB is satisfied.

Condition 2: The omitted variable is a determinant of the dependent variable. Here, since cyl is significantly correlated with disp, the second condition for OVB is satisfied.

Direction of Bias

Strong negative correlation (correlation coefficient = -0.852162): As cyl increases, mpg decreases.

Strong positive correlation (correlation coefficient = 0.9020329): As cyl increases, disp also increases.

Model comparison

# Incorrect model omitting cyl
incorrect_model <- lm(mpg ~ disp + hp, data = df)
incorrect_model
## 
## Call:
## lm(formula = mpg ~ disp + hp, data = df)
## 
## Coefficients:
## (Intercept)         disp           hp  
##    30.73590     -0.03035     -0.02484
stargazer(model, incorrect_model, type = "text",
          title = "Comparison of Regression Models",
          dep.var.labels = "Miles per Gallon (mpg)",
          column.labels = c("Full Model", "Incorrect Model"),
          covariate.labels = c("Cylinders (cyl)", "Displacement (disp)", "Horsepower (hp)"),
          omit.stat = c("f", "ser"))
## 
## Comparison of Regression Models
## ================================================
##                         Dependent variable:     
##                     ----------------------------
##                        Miles per Gallon (mpg)   
##                      Full Model  Incorrect Model
##                         (1)            (2)      
## ------------------------------------------------
## Cylinders (cyl)        -1.227                   
##                       (0.797)                   
##                                                 
## Displacement (disp)   -0.019*       -0.030***   
##                       (0.010)        (0.007)    
##                                                 
## Horsepower (hp)        -0.015        -0.025*    
##                       (0.015)        (0.013)    
##                                                 
## Constant             34.185***      30.736***   
##                       (2.591)        (1.332)    
##                                                 
## ------------------------------------------------
## Observations             32            32       
## R2                     0.768          0.748     
## Adjusted R2            0.743          0.731     
## ================================================
## Note:                *p<0.1; **p<0.05; ***p<0.01
  • In the Correct Model, the coefficient for disp is -0.019, indicating that for every unit increase in disp, the mpg decreases slightly.

  • In the Incorrect Model, the coefficient for disp is -0.030, which is more negative than in the Correct Model.

Intuition for OVB impact

Here, cylinders (cyl) is omitted from the model. Cylinders are likely correlated with both miles per gallon (mpg) and displacement (disp). Generally, cars with more cylinders tend to have larger displacement and lower fuel efficiency (mpg).

Bonus question

1.

cor(df$disp, df$wt)  
## [1] 0.8879799
cor(df$mpg, df$wt)    
## [1] -0.8676594
# Run Full Model
full_model <- lm(mpg ~ disp + wt, data = df)
summary(full_model)
## 
## Call:
## lm(formula = mpg ~ disp + wt, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4087 -2.3243 -0.7683  1.7721  6.3484 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.96055    2.16454  16.151 4.91e-16 ***
## disp        -0.01773    0.00919  -1.929  0.06362 .  
## wt          -3.35082    1.16413  -2.878  0.00743 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.917 on 29 degrees of freedom
## Multiple R-squared:  0.7809, Adjusted R-squared:  0.7658 
## F-statistic: 51.69 on 2 and 29 DF,  p-value: 2.744e-10
# Run Reduced Model
reduced_model <- lm(mpg ~ disp, data = df)
summary(reduced_model)
## 
## Call:
## lm(formula = mpg ~ disp, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.8922 -2.2022 -0.9631  1.6272  7.2305 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 29.599855   1.229720  24.070  < 2e-16 ***
## disp        -0.041215   0.004712  -8.747 9.38e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.251 on 30 degrees of freedom
## Multiple R-squared:  0.7183, Adjusted R-squared:  0.709 
## F-statistic: 76.51 on 1 and 30 DF,  p-value: 9.38e-10
# Compare coefficients
coef_full <- coef(full_model)
coef_reduced <- coef(reduced_model)

coef_full["disp"]
##        disp 
## -0.01772474
coef_reduced["disp"]
##        disp 
## -0.04121512

There is no big difference.

2.

model1 <- lm(mpg ~ disp, data = df)

# Second Model: mpg on disp and hp
model2 <- lm(mpg ~ disp + hp, data = df)

# Summarize the models side by side
stargazer(model1, model2, type = "text",
          title = "Comparison of Regression Models",
          dep.var.labels = "Miles per Gallon (mpg)",
          cov.labels = c("Displacement (disp)", "Horsepower (hp)"),
          omit.stat = c("f", "ser"),
          single.row = TRUE)
## 
## Comparison of Regression Models
## ================================================
##                      Dependent variable:        
##              -----------------------------------
##                    Miles per Gallon (mpg)       
##                     (1)               (2)       
## ------------------------------------------------
## disp         -0.041*** (0.005) -0.030*** (0.007)
## hp                              -0.025* (0.013) 
## Constant     29.600*** (1.230) 30.736*** (1.332)
## ------------------------------------------------
## Observations        32                32        
## R2                 0.718             0.748      
## Adjusted R2        0.709             0.731      
## ================================================
## Note:                *p<0.1; **p<0.05; ***p<0.01
## 
## Comparison of Regression Models
## ===================================
## Displacement (disp) Horsepower (hp)
## -----------------------------------
cor(df$hp, df$disp)
## [1] 0.7909486

No significant difference noticed.