#Clearing environment
rm(list = ls())
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(psych)
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
The bias of an estimator is the discrepancy between its expected value and the true value of the parameter being estimated. In layman’s words, it quantifies how far an estimator deviates from the true value of the parameter it is supposed to estimate.
Omitted variable bias will not be eliminated merely by expanding the sample size or adding more variables. While a greater sample size can result in more exact estimates, it does not address the bias produced by removing an important variable. If we add more factors, including those that account for the influence of the omitted variable, the bias may reduce. However, adding irrelevant variables may result in overfitting without resolving the initial bias. The most effective strategy to address omitted variable bias is to identify and include all relevant variables in your model.
df <- mtcars
head(df)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
describe(df)
## vars n mean sd median trimmed mad min max range skew
## mpg 1 32 20.09 6.03 19.20 19.70 5.41 10.40 33.90 23.50 0.61
## cyl 2 32 6.19 1.79 6.00 6.23 2.97 4.00 8.00 4.00 -0.17
## disp 3 32 230.72 123.94 196.30 222.52 140.48 71.10 472.00 400.90 0.38
## hp 4 32 146.69 68.56 123.00 141.19 77.10 52.00 335.00 283.00 0.73
## drat 5 32 3.60 0.53 3.70 3.58 0.70 2.76 4.93 2.17 0.27
## wt 6 32 3.22 0.98 3.33 3.15 0.77 1.51 5.42 3.91 0.42
## qsec 7 32 17.85 1.79 17.71 17.83 1.42 14.50 22.90 8.40 0.37
## vs 8 32 0.44 0.50 0.00 0.42 0.00 0.00 1.00 1.00 0.24
## am 9 32 0.41 0.50 0.00 0.38 0.00 0.00 1.00 1.00 0.36
## gear 10 32 3.69 0.74 4.00 3.62 1.48 3.00 5.00 2.00 0.53
## carb 11 32 2.81 1.62 2.00 2.65 1.48 1.00 8.00 7.00 1.05
## kurtosis se
## mpg -0.37 1.07
## cyl -1.76 0.32
## disp -1.21 21.91
## hp -0.14 12.12
## drat -0.71 0.09
## wt -0.02 0.17
## qsec 0.34 0.32
## vs -2.00 0.09
## am -1.92 0.09
## gear -1.07 0.13
## carb 1.26 0.29
The equation is:
\[ \text{mpg} = \beta_0 + \beta_1 \cdot \text{cyl} + \beta_2 \cdot \text{disp} + \beta_3 \cdot \text{hp} \]
# Fit the linear regression model
model <- lm(mpg ~ cyl + disp + hp, data = df)
# Display the summary of the model
summary(model)
##
## Call:
## lm(formula = mpg ~ cyl + disp + hp, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0889 -2.0845 -0.7745 1.3972 6.9183
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.18492 2.59078 13.195 1.54e-13 ***
## cyl -1.22742 0.79728 -1.540 0.1349
## disp -0.01884 0.01040 -1.811 0.0809 .
## hp -0.01468 0.01465 -1.002 0.3250
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.055 on 28 degrees of freedom
## Multiple R-squared: 0.7679, Adjusted R-squared: 0.743
## F-statistic: 30.88 on 3 and 28 DF, p-value: 5.054e-09
The estimated intercept is 34.18492, meaning that when all
independent variables are zero, the predicted mpg
would be
approximately 34.18. The model suggests a negative relationship between
cyl
, disp
, and hp
with
mpg
. However, only disp
is approaching
statistical significance, indicating it may have a slight impact on fuel
efficiency.
We will be studying the disp
as our independent
variable.
The modified equation will be:
\[ \text{mpg} = \beta_0' + \beta_1' \cdot \text{disp} + \beta_2' \cdot \text{hp} \]
cor_test_y <- cor.test(df$cyl, df$mpg)
# Output the results
cor_test_y
##
## Pearson's product-moment correlation
##
## data: df$cyl and df$mpg
## t = -8.9197, df = 30, p-value = 6.113e-10
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.9257694 -0.7163171
## sample estimates:
## cor
## -0.852162
cor_test_x <- cor.test(df$cyl, df$disp)
# Output the results
cor_test_x
##
## Pearson's product-moment correlation
##
## data: df$cyl and df$disp
## t = 11.445, df = 30, p-value = 1.803e-12
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8072442 0.9514607
## sample estimates:
## cor
## 0.9020329
Condition 1: The omitted variable is correlated with
the included independent variable. Here, since cyl
is
significantly correlated with mpg
, the first condition for
OVB is satisfied.
Condition 2: The omitted variable is a determinant
of the dependent variable. Here, since cyl
is significantly
correlated with disp
, the second condition for OVB is
satisfied.
Strong negative correlation (correlation coefficient = -0.852162): As
cyl
increases, mpg
decreases.
Strong positive correlation (correlation coefficient = 0.9020329): As
cyl
increases, disp
also increases.
# Incorrect model omitting cyl
incorrect_model <- lm(mpg ~ disp + hp, data = df)
incorrect_model
##
## Call:
## lm(formula = mpg ~ disp + hp, data = df)
##
## Coefficients:
## (Intercept) disp hp
## 30.73590 -0.03035 -0.02484
stargazer(model, incorrect_model, type = "text",
title = "Comparison of Regression Models",
dep.var.labels = "Miles per Gallon (mpg)",
column.labels = c("Full Model", "Incorrect Model"),
covariate.labels = c("Cylinders (cyl)", "Displacement (disp)", "Horsepower (hp)"),
omit.stat = c("f", "ser"))
##
## Comparison of Regression Models
## ================================================
## Dependent variable:
## ----------------------------
## Miles per Gallon (mpg)
## Full Model Incorrect Model
## (1) (2)
## ------------------------------------------------
## Cylinders (cyl) -1.227
## (0.797)
##
## Displacement (disp) -0.019* -0.030***
## (0.010) (0.007)
##
## Horsepower (hp) -0.015 -0.025*
## (0.015) (0.013)
##
## Constant 34.185*** 30.736***
## (2.591) (1.332)
##
## ------------------------------------------------
## Observations 32 32
## R2 0.768 0.748
## Adjusted R2 0.743 0.731
## ================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
In the Correct Model, the coefficient for
disp
is -0.019, indicating that for every unit increase in
disp
, the mpg decreases slightly.
In the Incorrect Model, the coefficient for
disp
is -0.030, which is more negative than in the
Correct Model.
Here, cylinders (cyl) is omitted from the model. Cylinders are likely correlated with both miles per gallon (mpg) and displacement (disp). Generally, cars with more cylinders tend to have larger displacement and lower fuel efficiency (mpg).
cor(df$disp, df$wt)
## [1] 0.8879799
cor(df$mpg, df$wt)
## [1] -0.8676594
# Run Full Model
full_model <- lm(mpg ~ disp + wt, data = df)
summary(full_model)
##
## Call:
## lm(formula = mpg ~ disp + wt, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4087 -2.3243 -0.7683 1.7721 6.3484
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.96055 2.16454 16.151 4.91e-16 ***
## disp -0.01773 0.00919 -1.929 0.06362 .
## wt -3.35082 1.16413 -2.878 0.00743 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.917 on 29 degrees of freedom
## Multiple R-squared: 0.7809, Adjusted R-squared: 0.7658
## F-statistic: 51.69 on 2 and 29 DF, p-value: 2.744e-10
# Run Reduced Model
reduced_model <- lm(mpg ~ disp, data = df)
summary(reduced_model)
##
## Call:
## lm(formula = mpg ~ disp, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.8922 -2.2022 -0.9631 1.6272 7.2305
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.599855 1.229720 24.070 < 2e-16 ***
## disp -0.041215 0.004712 -8.747 9.38e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.251 on 30 degrees of freedom
## Multiple R-squared: 0.7183, Adjusted R-squared: 0.709
## F-statistic: 76.51 on 1 and 30 DF, p-value: 9.38e-10
# Compare coefficients
coef_full <- coef(full_model)
coef_reduced <- coef(reduced_model)
coef_full["disp"]
## disp
## -0.01772474
coef_reduced["disp"]
## disp
## -0.04121512
There is no big difference.
model1 <- lm(mpg ~ disp, data = df)
# Second Model: mpg on disp and hp
model2 <- lm(mpg ~ disp + hp, data = df)
# Summarize the models side by side
stargazer(model1, model2, type = "text",
title = "Comparison of Regression Models",
dep.var.labels = "Miles per Gallon (mpg)",
cov.labels = c("Displacement (disp)", "Horsepower (hp)"),
omit.stat = c("f", "ser"),
single.row = TRUE)
##
## Comparison of Regression Models
## ================================================
## Dependent variable:
## -----------------------------------
## Miles per Gallon (mpg)
## (1) (2)
## ------------------------------------------------
## disp -0.041*** (0.005) -0.030*** (0.007)
## hp -0.025* (0.013)
## Constant 29.600*** (1.230) 30.736*** (1.332)
## ------------------------------------------------
## Observations 32 32
## R2 0.718 0.748
## Adjusted R2 0.709 0.731
## ================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
##
## Comparison of Regression Models
## ===================================
## Displacement (disp) Horsepower (hp)
## -----------------------------------
cor(df$hp, df$disp)
## [1] 0.7909486
No significant difference noticed.