Load Packages

library( pander )     # formatting tables
library( dplyr )      # data wrangling
library( stargazer )  # regression tables

Load Data

URL <- "https://raw.githubusercontent.com/DS4PS/cpp-523-fall-2019/master/labs/data/predicting-subjective-wellbeing.csv"
dat <- read.csv( URL, stringsAsFactors=F )

head( dat ) %>% pander()

Id	SWB	CSE	PANASpos	PANASneg	SJAS_hc	ND	RSE	PSS	NP
1	33	39	39	15	2	5	48	93	24
2	33	59	48	27	8	6	45	95	29
3	29	54	44	24	7	5	43	89	103
4	21	36	30	17	2	6	45	74	19
5	25	48	41	18	6	6	47	93	20
6	23	51.43	30	25	4.444	6	28	91	18

Solutions

Question 1.

Part 1

Consider a three-variable regression of Subjective Well-Being (SWB), Network Diversity (ND) and Perceived Social Support (PSS):

\(SWB=β_0+β_1 \cdot ND+β_2 \cdot PSS+e\)

m.full <- lm( SWB ~ ND + PSS, data=dat )

stargazer( m.full, 
           type = "html", digits=2,
           dep.var.caption = "DV: Subjective Well-Being",
           omit.stat = c("rsq","f","ser"),
           notes.label = "Standard errors in parentheses")

## 
## <table style="text-align:center"><tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td>DV: Subjective Well-Being</td></tr>
## <tr><td></td><td colspan="1" style="border-bottom: 1px solid black"></td></tr>
## <tr><td style="text-align:left"></td><td>SWB</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">ND</td><td>-0.05</td></tr>
## <tr><td style="text-align:left"></td><td>(0.19)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">PSS</td><td>0.32<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.03)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">Constant</td><td>-2.35</td></tr>
## <tr><td style="text-align:left"></td><td>(2.59)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Observations</td><td>389</td></tr>
## <tr><td style="text-align:left">Adjusted R<sup>2</sup></td><td>0.21</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Standard errors in parentheses</td><td style="text-align:right"><sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01</td></tr>
## </table>

head( dat[c("SWB","ND","PSS")] ) %>% pander

SWB	ND	PSS
33	5	93
33	6	95
29	5	89
21	6	74
25	6	93
23	6	91

Calculate the omitted variable bias on the Network Diversity (ND) coefficient that results from omitting the Perceived Social Support (PSS) variable from the regression.

\(SWB=b_0+b_1 \cdot ND+e\)

m.full <- lm( SWB ~ ND + PSS, data=dat )
m.naive <- lm( SWB ~ ND, data=dat )

stargazer( m.naive, m.full, 
           type = "text`11q", digits=2,
           dep.var.caption = "DV: Subjective Well-Being",
           omit.stat = c("rsq","f","ser"),
           notes.label = "Standard errors in parentheses")

% Error: ‘style’ must be either ‘latex’ (default), ‘html’ or ‘text.’

panel.cor <- function(x, y, digits=2, prefix="", cex.cor)
{
    usr <- par("usr"); on.exit(par(usr))
    par(usr = c(0, 1, 0, 1))
    r <- cor(x, y, use="pairwise.complete.obs")
    txt <- format(c(r, 0.123456789), digits=digits)[1]
    txt <- paste(prefix, txt, sep="")
    if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
    
    test <- cor.test(x,y)
    # borrowed from printCoefmat
    Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
                  cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
                  symbols = c("***", "**", "*", ".", " "))
    
    text(0.5, 0.5, txt, cex = 2 )
    text(.7, .8, Signif, cex=3, col=2)
}


panel.smooth <- function (x, y, col = par("col"), bg = NA, pch = par("pch"), 
  cex = 1, col.smooth = "red", span = 2/3, iter = 3, ...) 
{
  points(x, y, pch = 19, col = gray(0.5,0.5), 
         bg = bg, cex = 1.7)
  ok <- is.finite(x) & is.finite(y)
  if (any(ok)) 
    lines(stats::lowess(x[ok], y[ok], f = span, iter = iter), 
      col = col.smooth, lwd=2, ...)
}

pairs( dat[c("SWB","ND","PSS","CSE","RSE" )], 
       lower.panel=panel.smooth, upper.panel=panel.cor )

You can read these values from the table above.

B1 <- (0.52)  # replace with the appropriate value from the table above
b1 <- (0.32)  # replace with the appropriate value from the table above
bias <- b1 - B1
bias

## [1] -0.2

Part 2

Run the auxiliary regression to estimate \(\alpha_1\).

Calculate the bias using the omitted variable bias equation and show your math. You can check your results by comparing your answer to bias calculation from Part 1.

m.auxiliary <- lm( SWB ~ ND + PSS, data=dat )

stargazer( m.auxiliary,  
           type = "html", digits=2,
           dep.var.caption = "DV: PSS",
           omit.stat = c("rsq","f","ser"),
           notes.label = "Standard errors in parentheses")


	DV: PSS

	SWB

ND	-0.05
	(0.19)

PSS	0.32^***
	(0.03)

Constant	-2.35
	(2.59)


Observations	389
Adjusted R²	0.21

Standard errors in parentheses	p<0.1; p<0.05; p<0.01

a1 <- .19
B2 <- .03
bias <- a1 * B2
bias

[1] 0.0057

m.auxiliary <- lm( PSS ~ ND , data=dat )
summary( m.auxiliary )

## 
## Call:
## lm(formula = PSS ~ ND, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.322  -4.765   1.420   6.235  14.678 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  73.8780     1.6928  43.642  < 2e-16 ***
## ND            1.8146     0.2802   6.476 2.86e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.98 on 387 degrees of freedom
## Multiple R-squared:  0.09777,    Adjusted R-squared:  0.09544 
## F-statistic: 41.94 on 1 and 387 DF,  p-value: 2.862e-10

#Coefficients (below for the Auxiliary)

# naive regression in the example: TS = b0 + b1*CS
m.naive <- lm( SWB ~ ND, data=dat )
summary( m.naive )

## 
## Call:
## lm(formula = SWB ~ ND, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -17.565  -4.134   0.389   4.389  10.866 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  20.9950     1.1895  17.651   <2e-16 ***
## ND            0.5232     0.1969   2.657   0.0082 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.607 on 387 degrees of freedom
## Multiple R-squared:  0.01792,    Adjusted R-squared:  0.01538 
## F-statistic: 7.062 on 1 and 387 DF,  p-value: 0.008202

#Coefficients (below for the Naive)

# full regression: TS = B0 + B1*CS + B2*SES
m.full <- lm( SWB ~ ND + PSS, data=dat )
summary( m.full )

## 
## Call:
## lm(formula = SWB ~ ND + PSS, data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.5727  -3.4203   0.2601   3.8957  14.1729 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.34974    2.58867  -0.908    0.365    
## ND          -0.05019    0.18538  -0.271    0.787    
## PSS          0.31599    0.03194   9.892   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.015 on 386 degrees of freedom
## Multiple R-squared:  0.2165, Adjusted R-squared:  0.2125 
## F-statistic: 53.34 on 2 and 386 DF,  p-value: < 2.2e-16

#Coefficients (below for the Full Regression)

#Calculate the bias using omitted variable 
# b1 = B1 + bias
# b1 - B1 = bias
b1 <- .523
B1 <- -0.050
b1 - B1

## [1] 0.573

# bias = a1*B2
a1 <- 1.81
B2 <- .316
a1*B2

## [1] 0.57196

Part 3

How good is the naive estimation of β1, the impact of network diversity on our happiness, in this case?

bias / B1  # rough measure of the magnitude of bias

Part 4

In the previous lecture we saw how the Class Size model lost significance when SES was added as a result of an increase in the standard errors. In this model Network Diversity also loses significance. Explain why.

Answer - I think network diversity loses significance due to the other coefficients SWB and PSS causes X to become less significant even though it may overlap in variance.

Question 2.

m.01 <- lm( SWB ~ CSE, data=dat )
m.02 <- lm( SWB ~ RSE, data=dat )
m.03 <- lm( SWB ~ CSE + RSE, data=dat )

stargazer( m.01, m.02, m.03, 
           type = "text", digits=2,
           dep.var.caption = "DV: Subjective Well-Being",
           omit.stat = c("rsq","f","ser"),
           notes.label = "Standard errors in parentheses")

## 
## ============================================================
##                                  DV: Subjective Well-Being  
##                                -----------------------------
##                                             SWB             
##                                   (1)        (2)      (3)   
## ------------------------------------------------------------
## CSE                             -0.11***            0.09*** 
##                                  (0.03)              (0.03) 
##                                                             
## RSE                                        0.55***  0.61*** 
##                                            (0.04)    (0.04) 
##                                                             
## Constant                        29.29***    1.62     -4.82* 
##                                  (1.65)    (1.54)    (2.68) 
##                                                             
## ------------------------------------------------------------
## Observations                      389        389      389   
## Adjusted R2                       0.02      0.36      0.37  
## ============================================================
## Standard errors in parentheses   *p<0.1; **p<0.05; ***p<0.01

Part 1

What happened to the slope of CSE?
The slope of CSE was negative yet significant due to the stars notated.

What happened to the standard error of CSE?
The standard error is smaller which shows it’s more accurate.

How would our assessment of CSE change after we control for baseline self-esteem? For example, if we are a psychologist working with students should we worry if we observe a case where a person has a high need for approval from others? Will it impact their happiness? Or should we focus on other things?

I think we should focus on other things since it’s obvious PSS and SWB are effective based on specific coefficients.The assessment of CSE could make the baseline of self-esteem more accurate if there is a high need for approval of others.

Part 2

In Quesion 2 the control variable cased the slope estimate for Network Diversity to shift to the left toward the null hypothesis (slope=0, no impact) and as a result it lost statistical significance.

What happens in this case? Why did that happen? Drawing the coefficient plot might help. The correlation changed between the coefficients, which led to the covariance changing and the slope.

library(dplyr)
m1 <- lm(SWB ~ ND + PSS + CSE, data=dat )
source("https://www.r-statistics.com/wp-content/uploads/2010/07/coefplot.r.txt")

coefplot(m1, add=TRUE, col.pts="red", intercept=TRUE)

Part 3

Explain the direction of bias in this model using the formula for omitted variable bias: what must be true about the sign of the correlation between RSE and CSE to get these results? The correlation between RSE and CSE is that CSE can be more significant than RSE yet still can be bias.

\(bias = \alpha_1 \cdot B_2\)

Where \(\alpha_1\) represents the correlation between CSE and RSE, and \(B_2\) represents the correlation between RSE and the outcome Subjective Well-Being. Specifically, correlations and slopes always have the same sign.

m.02 <- lm( SWB ~ RSE, data=dat )
coefplot(m.02, add=TRUE, col.pts="red", intercept=TRUE)

m.03 <- lm( SWB ~ CSE + RSE, data=dat )
coefplot(m.03, add=TRUE, col.pts="red", intercept=TRUE)

\(\alpha_1\)	\(B_2\)	Sign of Bias
(+)	(+)	(+)
(-)	(-)	(+)
(+)	(-)	(-)
(-)	(+)	(-)

\(b_1 = B_1 + bias\)

Therefore:

If bias (+) then \(b_1 > B_1\)

If bias (-) then \(b_1 < B_1\)

Submission Instructions

After you have completed your lab, knit your RMD file. Login to Canvas at http://canvas.asu.edu and navigate to the assignments tab in the course repository. Upload your RMD and your HTML files to the appropriate lab submission link.

Remember to:

name your files according to the convention: Lab-##-LastName.Rmd
show your solution, include your code.
do not print excessive output (like a full data set).

Lab 04 - Omitted Variable Bias

Britnie Britton

12 June, 2022

Load Packages

Load Data

Solutions

Question 1.

Part 1

Part 2

Part 3

Part 4

Question 2.

Part 1

Part 2

Part 3

Submission Instructions