library( pander ) # formatting tables
library( dplyr ) # data wrangling
library( stargazer ) # regression tables
<- "https://raw.githubusercontent.com/DS4PS/cpp-523-fall-2019/master/labs/data/predicting-subjective-wellbeing.csv"
URL <- read.csv( URL, stringsAsFactors=F )
dat
head( dat ) %>% pander()
Id | SWB | CSE | PANASpos | PANASneg | SJAS_hc | ND | RSE | PSS | NP |
---|---|---|---|---|---|---|---|---|---|
1 | 33 | 39 | 39 | 15 | 2 | 5 | 48 | 93 | 24 |
2 | 33 | 59 | 48 | 27 | 8 | 6 | 45 | 95 | 29 |
3 | 29 | 54 | 44 | 24 | 7 | 5 | 43 | 89 | 103 |
4 | 21 | 36 | 30 | 17 | 2 | 6 | 45 | 74 | 19 |
5 | 25 | 48 | 41 | 18 | 6 | 6 | 47 | 93 | 20 |
6 | 23 | 51.43 | 30 | 25 | 4.444 | 6 | 28 | 91 | 18 |
Consider a three-variable regression of Subjective Well-Being (SWB), Network Diversity (ND) and Perceived Social Support (PSS):
\(SWB=β_0+β_1 \cdot ND+β_2 \cdot PSS+e\)
<- lm( SWB ~ ND + PSS, data=dat )
m.full
stargazer( m.full,
type = "html", digits=2,
dep.var.caption = "DV: Subjective Well-Being",
omit.stat = c("rsq","f","ser"),
notes.label = "Standard errors in parentheses")
##
## <table style="text-align:center"><tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left"></td><td>DV: Subjective Well-Being</td></tr>
## <tr><td></td><td colspan="1" style="border-bottom: 1px solid black"></td></tr>
## <tr><td style="text-align:left"></td><td>SWB</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">ND</td><td>-0.05</td></tr>
## <tr><td style="text-align:left"></td><td>(0.19)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">PSS</td><td>0.32<sup>***</sup></td></tr>
## <tr><td style="text-align:left"></td><td>(0.03)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td style="text-align:left">Constant</td><td>-2.35</td></tr>
## <tr><td style="text-align:left"></td><td>(2.59)</td></tr>
## <tr><td style="text-align:left"></td><td></td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Observations</td><td>389</td></tr>
## <tr><td style="text-align:left">Adjusted R<sup>2</sup></td><td>0.21</td></tr>
## <tr><td colspan="2" style="border-bottom: 1px solid black"></td></tr><tr><td style="text-align:left">Standard errors in parentheses</td><td style="text-align:right"><sup>*</sup>p<0.1; <sup>**</sup>p<0.05; <sup>***</sup>p<0.01</td></tr>
## </table>
head( dat[c("SWB","ND","PSS")] ) %>% pander
SWB | ND | PSS |
---|---|---|
33 | 5 | 93 |
33 | 6 | 95 |
29 | 5 | 89 |
21 | 6 | 74 |
25 | 6 | 93 |
23 | 6 | 91 |
Calculate the omitted variable bias on the Network Diversity (ND) coefficient that results from omitting the Perceived Social Support (PSS) variable from the regression.
\(SWB=b_0+b_1 \cdot ND+e\)
<- lm( SWB ~ ND + PSS, data=dat )
m.full <- lm( SWB ~ ND, data=dat )
m.naive
stargazer( m.naive, m.full,
type = "text`11q", digits=2,
dep.var.caption = "DV: Subjective Well-Being",
omit.stat = c("rsq","f","ser"),
notes.label = "Standard errors in parentheses")
% Error: ‘style’ must be either ‘latex’ (default), ‘html’ or ‘text.’
<- function(x, y, digits=2, prefix="", cex.cor)
panel.cor
{<- par("usr"); on.exit(par(usr))
usr par(usr = c(0, 1, 0, 1))
<- cor(x, y, use="pairwise.complete.obs")
r <- format(c(r, 0.123456789), digits=digits)[1]
txt <- paste(prefix, txt, sep="")
txt if(missing(cex.cor)) cex <- 0.8/strwidth(txt)
<- cor.test(x,y)
test # borrowed from printCoefmat
<- symnum(test$p.value, corr = FALSE, na = FALSE,
Signif cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1),
symbols = c("***", "**", "*", ".", " "))
text(0.5, 0.5, txt, cex = 2 )
text(.7, .8, Signif, cex=3, col=2)
}
<- function (x, y, col = par("col"), bg = NA, pch = par("pch"),
panel.smooth cex = 1, col.smooth = "red", span = 2/3, iter = 3, ...)
{points(x, y, pch = 19, col = gray(0.5,0.5),
bg = bg, cex = 1.7)
<- is.finite(x) & is.finite(y)
ok if (any(ok))
lines(stats::lowess(x[ok], y[ok], f = span, iter = iter),
col = col.smooth, lwd=2, ...)
}
pairs( dat[c("SWB","ND","PSS","CSE","RSE" )],
lower.panel=panel.smooth, upper.panel=panel.cor )
You can read these values from the table above.
<- (0.52) # replace with the appropriate value from the table above
B1 <- (0.32) # replace with the appropriate value from the table above
b1 <- b1 - B1
bias bias
## [1] -0.2
Run the auxiliary regression to estimate \(\alpha_1\).
Calculate the bias using the omitted variable bias equation and show your math. You can check your results by comparing your answer to bias calculation from Part 1.
<- lm( SWB ~ ND + PSS, data=dat )
m.auxiliary
stargazer( m.auxiliary,
type = "html", digits=2,
dep.var.caption = "DV: PSS",
omit.stat = c("rsq","f","ser"),
notes.label = "Standard errors in parentheses")
DV: PSS | |
SWB | |
ND | -0.05 |
(0.19) | |
PSS | 0.32*** |
(0.03) | |
Constant | -2.35 |
(2.59) | |
Observations | 389 |
Adjusted R2 | 0.21 |
Standard errors in parentheses | p<0.1; p<0.05; p<0.01 |
<- .19
a1 <- .03
B2 <- a1 * B2
bias bias
[1] 0.0057
<- lm( PSS ~ ND , data=dat )
m.auxiliary summary( m.auxiliary )
##
## Call:
## lm(formula = PSS ~ ND, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.322 -4.765 1.420 6.235 14.678
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 73.8780 1.6928 43.642 < 2e-16 ***
## ND 1.8146 0.2802 6.476 2.86e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 7.98 on 387 degrees of freedom
## Multiple R-squared: 0.09777, Adjusted R-squared: 0.09544
## F-statistic: 41.94 on 1 and 387 DF, p-value: 2.862e-10
#Coefficients (below for the Auxiliary)
# naive regression in the example: TS = b0 + b1*CS
<- lm( SWB ~ ND, data=dat )
m.naive summary( m.naive )
##
## Call:
## lm(formula = SWB ~ ND, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.565 -4.134 0.389 4.389 10.866
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 20.9950 1.1895 17.651 <2e-16 ***
## ND 0.5232 0.1969 2.657 0.0082 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.607 on 387 degrees of freedom
## Multiple R-squared: 0.01792, Adjusted R-squared: 0.01538
## F-statistic: 7.062 on 1 and 387 DF, p-value: 0.008202
#Coefficients (below for the Naive)
# full regression: TS = B0 + B1*CS + B2*SES
<- lm( SWB ~ ND + PSS, data=dat )
m.full summary( m.full )
##
## Call:
## lm(formula = SWB ~ ND + PSS, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.5727 -3.4203 0.2601 3.8957 14.1729
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.34974 2.58867 -0.908 0.365
## ND -0.05019 0.18538 -0.271 0.787
## PSS 0.31599 0.03194 9.892 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.015 on 386 degrees of freedom
## Multiple R-squared: 0.2165, Adjusted R-squared: 0.2125
## F-statistic: 53.34 on 2 and 386 DF, p-value: < 2.2e-16
#Coefficients (below for the Full Regression)
#Calculate the bias using omitted variable
# b1 = B1 + bias
# b1 - B1 = bias
<- .523
b1 <- -0.050
B1 - B1 b1
## [1] 0.573
# bias = a1*B2
<- 1.81
a1 <- .316
B2 *B2 a1
## [1] 0.57196
How good is the naive estimation of β1, the impact of network diversity on our happiness, in this case?
/ B1 # rough measure of the magnitude of bias bias
In the previous lecture we saw how the Class Size model lost significance when SES was added as a result of an increase in the standard errors. In this model Network Diversity also loses significance. Explain why.
Answer - I think network diversity loses significance due to the
other coefficients SWB and PSS causes X to become less significant even
though it may overlap in variance.
.01 <- lm( SWB ~ CSE, data=dat )
m.02 <- lm( SWB ~ RSE, data=dat )
m.03 <- lm( SWB ~ CSE + RSE, data=dat )
m
stargazer( m.01, m.02, m.03,
type = "text", digits=2,
dep.var.caption = "DV: Subjective Well-Being",
omit.stat = c("rsq","f","ser"),
notes.label = "Standard errors in parentheses")
##
## ============================================================
## DV: Subjective Well-Being
## -----------------------------
## SWB
## (1) (2) (3)
## ------------------------------------------------------------
## CSE -0.11*** 0.09***
## (0.03) (0.03)
##
## RSE 0.55*** 0.61***
## (0.04) (0.04)
##
## Constant 29.29*** 1.62 -4.82*
## (1.65) (1.54) (2.68)
##
## ------------------------------------------------------------
## Observations 389 389 389
## Adjusted R2 0.02 0.36 0.37
## ============================================================
## Standard errors in parentheses *p<0.1; **p<0.05; ***p<0.01
What happened to the slope of CSE?
The slope of CSE was negative yet significant due to the stars
notated.
What happened to the standard error of CSE?
The standard error is smaller which shows it’s more accurate.
How would our assessment of CSE change after we control for baseline self-esteem? For example, if we are a psychologist working with students should we worry if we observe a case where a person has a high need for approval from others? Will it impact their happiness? Or should we focus on other things?
I think we should focus on other things since it’s obvious PSS and SWB are effective based on specific coefficients.The assessment of CSE could make the baseline of self-esteem more accurate if there is a high need for approval of others.
In Quesion 2 the control variable cased the slope estimate for Network Diversity to shift to the left toward the null hypothesis (slope=0, no impact) and as a result it lost statistical significance.
What happens in this case? Why did that happen? Drawing the coefficient plot might help. The correlation changed between the coefficients, which led to the covariance changing and the slope.
library(dplyr)
<- lm(SWB ~ ND + PSS + CSE, data=dat )
m1 source("https://www.r-statistics.com/wp-content/uploads/2010/07/coefplot.r.txt")
coefplot(m1, add=TRUE, col.pts="red", intercept=TRUE)
Explain the direction of bias in this model using the formula for omitted variable bias: what must be true about the sign of the correlation between RSE and CSE to get these results? The correlation between RSE and CSE is that CSE can be more significant than RSE yet still can be bias.
\(bias = \alpha_1 \cdot B_2\)
Where \(\alpha_1\) represents the correlation between CSE and RSE, and \(B_2\) represents the correlation between RSE and the outcome Subjective Well-Being. Specifically, correlations and slopes always have the same sign.
.02 <- lm( SWB ~ RSE, data=dat )
mcoefplot(m.02, add=TRUE, col.pts="red", intercept=TRUE)
.03 <- lm( SWB ~ CSE + RSE, data=dat )
mcoefplot(m.03, add=TRUE, col.pts="red", intercept=TRUE)
\(\alpha_1\) | \(B_2\) | Sign of Bias |
---|---|---|
(+) | (+) | (+) |
(-) | (-) | (+) |
(+) | (-) | (-) |
(-) | (+) | (-) |
\(b_1 = B_1 + bias\)
Therefore:
If bias (+) then \(b_1 > B_1\)
If bias (-) then \(b_1 < B_1\)
After you have completed your lab, knit your RMD file. Login to Canvas at http://canvas.asu.edu and navigate to the assignments tab in the course repository. Upload your RMD and your HTML files to the appropriate lab submission link.
Remember to: