Download the dataset from here.
raw_df = read_dta('/Users/g/Desktop/GSS7218_R1.dta')
df = raw_df
# the 2012 GSS made a dumb coding change
df$sexfreq[which(df$partners == 0 & df$year == 2012)] = 0
df = df %>%
select(
sexfreq,
sex,
year,
income,
rincome,
age,
race,
marital,
wtssall
) %>%
mutate(
sex=as_factor(sex),
sexfreq=as_factor(sexfreq),
income=as_factor(income),
rincome=as_factor(rincome),
race=as_factor(race),
marital=as_factor(marital),
age=as.numeric(age),
age_bracket=cut(age, c(17, 29, 39, 49, 59, 89)),
wtssall=as.numeric(wtssall),
hadsex=ifelse(sexfreq == 'NOT AT ALL', 0, 1),
year_anchored=year - min(year)
) %>%
filter(
!(sexfreq %in% c('NA', 'DK', 'IAP')),
marital != 'NA',
!is.na(sexfreq)
)
Mice that shit
imp = mice(df, method='pmm', m=1)
##
## iter imp variable
## 1 1 income rincome age age_bracket
## 2 1 income rincome age age_bracket
## 3 1 income rincome age age_bracket
## 4 1 income rincome age age_bracket
## 5 1 income rincome age age_bracket
## Warning: Number of logged events: 21
df2 = complete(imp)
yearly = df2 %>%
group_by(year) %>%
summarize(hadsex2=mean(hadsex), hadsex=sum(hadsex*wtssall)/sum(wtssall), n=n())
print(yearly)
## # A tibble: 17 x 4
## year hadsex2 hadsex n
## <dbl> <dbl> <dbl> <int>
## 1 1989 0.781 0.814 1361
## 2 1990 0.801 0.823 552
## 3 1991 0.766 0.813 1250
## 4 1993 0.784 0.824 1428
## 5 1994 0.786 0.826 2633
## 6 1996 0.812 0.850 2527
## 7 1998 0.778 0.818 2320
## 8 2000 0.775 0.818 2263
## 9 2002 0.775 0.813 2151
## 10 2004 0.789 0.824 2109
## 11 2006 0.745 0.795 2333
## 12 2008 0.755 0.807 1683
## 13 2010 0.749 0.793 1752
## 14 2012 0.725 0.769 1601
## 15 2014 0.743 0.777 2220
## 16 2016 0.743 0.777 1711
## 17 2018 0.731 0.768 1357
df2 %>%
group_by(year) %>%
summarize(hadsex=sum(hadsex*wtssall)/sum(wtssall)) %>%
ggplot(aes(x=year, y=hadsex)) +
geom_line() +
theme_minimal() +
labs(title='Percentage of Americans having sex in last year, GSS, 1989-2018')
Drop the oldest age bracket
df3 = df2 %>%
filter(age_bracket != '(59,89]')
df3 %>%
group_by(year) %>%
summarize(hadsex=sum(hadsex*wtssall)/sum(wtssall)) %>%
ggplot(aes(x=year, y=hadsex)) +
geom_line() +
theme_minimal() +
labs(title='Percentage of Americans under 60 having sex in last year, GSS, 1989-2018')
df3 %>%
group_by(year, age_bracket) %>%
summarize(hadsex=sum(hadsex*wtssall)/sum(wtssall)) %>%
ggplot(aes(x=year, y=hadsex)) +
geom_line() +
geom_smooth(method=lm, se=FALSE) +
facet_wrap(~age_bracket) +
theme_bw() +
labs(title='Percentage of Americans having sex in last year by age bracket, GSS, 1989-2018')
summary(
lm(
hadsex~year_anchored*age_bracket,
data=df3,
weights=wtssall
)
)
##
## Call:
## lm(formula = hadsex ~ year_anchored * age_bracket, data = df3,
## weights = wtssall)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -2.16955 0.05973 0.09442 0.14259 0.51873
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9201513 0.0146352 62.872 < 2e-16
## year_anchored -0.0020150 0.0004619 -4.362 1.29e-05
## age_bracket(29,39] 0.0528946 0.0208347 2.539 0.0111
## age_bracket(39,49] 0.0130940 0.0216178 0.606 0.5447
## age_bracket(49,59] -0.0422188 0.0235444 -1.793 0.0730
## year_anchored:age_bracket(29,39] 0.0007100 0.0006626 1.071 0.2840
## year_anchored:age_bracket(39,49] 0.0010473 0.0006871 1.524 0.1274
## year_anchored:age_bracket(49,59] -0.0005230 0.0007244 -0.722 0.4703
##
## (Intercept) ***
## year_anchored ***
## age_bracket(29,39] *
## age_bracket(39,49]
## age_bracket(49,59] .
## year_anchored:age_bracket(29,39]
## year_anchored:age_bracket(39,49]
## year_anchored:age_bracket(49,59]
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3295 on 24103 degrees of freedom
## Multiple R-squared: 0.02456, Adjusted R-squared: 0.02428
## F-statistic: 86.7 on 7 and 24103 DF, p-value: < 2.2e-16
df3 %>%
group_by(year, age_bracket, hadsex) %>%
summarize(n=n()) %>%
ggplot(aes(x=year, y=n, fill=factor(hadsex))) +
geom_col(position='dodge') +
facet_wrap(~age_bracket) +
theme_bw() +
labs(title='Number of respondents for each category by age by year, GSS, 1989-2018')
No interactions
mod = lm(
hadsex~sex + rincome + marital + race,
data=df3,
weights=wtssall
)
df3$hadsex_resid = residuals(mod)
df3$hadsex_pred = predict(mod)
df3 %>%
group_by(year, age_bracket) %>%
summarize(hadsex_resid=sum(hadsex_resid * wtssall) / sum(wtssall), n=n()) %>%
ggplot(aes(x=year, y=hadsex_resid)) +
geom_line() +
geom_smooth(method=lm, se=FALSE) +
facet_wrap(~age_bracket) +
theme_bw() +
labs(title='Residuals from LPM, y = f(sex, rincome, marital, race) + e, GSS, 1989-2018')
summary(
lm(
hadsex_resid~year_anchored*age_bracket,
data=df3,
weights=wtssall
)
)
##
## Call:
## lm(formula = hadsex_resid ~ year_anchored * age_bracket, data = df3,
## weights = wtssall)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -1.98057 -0.01330 0.08315 0.16997 0.86976
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0502418 0.0137500 3.654 0.000259
## year_anchored -0.0004866 0.0004340 -1.121 0.262223
## age_bracket(29,39] -0.0038167 0.0195745 -0.195 0.845408
## age_bracket(39,49] -0.0598419 0.0203102 -2.946 0.003218
## age_bracket(49,59] -0.1021274 0.0221203 -4.617 3.92e-06
## year_anchored:age_bracket(29,39] 0.0002252 0.0006225 0.362 0.717562
## year_anchored:age_bracket(39,49] 0.0007009 0.0006455 1.086 0.277570
## year_anchored:age_bracket(49,59] -0.0009999 0.0006806 -1.469 0.141826
##
## (Intercept) ***
## year_anchored
## age_bracket(29,39]
## age_bracket(39,49] **
## age_bracket(49,59] ***
## year_anchored:age_bracket(29,39]
## year_anchored:age_bracket(39,49]
## year_anchored:age_bracket(49,59]
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3096 on 24103 degrees of freedom
## Multiple R-squared: 0.02885, Adjusted R-squared: 0.02857
## F-statistic: 102.3 on 7 and 24103 DF, p-value: < 2.2e-16
mod = lm(
hadsex~factor(year) + sex + age + I(age^2) + income + marital + race,
data=df3,
weights=wtssall
)
df3$hadsex_resid = residuals(mod)
df3$hadsex_pred = predict(mod)
df3 %>%
group_by(year, age_bracket) %>%
summarize(hadsex_resid=sum(hadsex_resid * wtssall) / sum(wtssall), n=n()) %>%
ggplot(aes(x=year, y=hadsex_resid)) +
geom_line() +
geom_smooth(method=lm, se=FALSE) +
facet_wrap(~age_bracket) +
theme_minimal() +
labs(title='Residuals from LPM, y = f(sex, rincome, marital, race, age, age^2) + e, GSS, 1989-2018')
Let’s predict the residuals
hist(log(df3$hadsex_resid^2))
Whomst is the hardest to predict?
df3 %>%
group_by(year, age_bracket) %>%
mutate(
resid_sd=sum(
wtssall * (hadsex_resid - sum(wtssall * hadsex_resid) / sum(wtssall))^2
) / sum(wtssall)
) %>%
ggplot(aes(x=year, y=resid_sd)) +
geom_line() +
facet_wrap(~age_bracket)
Let’s MODEL THE LOG SQUARED ERROR
tmp = df3 %>%
mutate(
log_sq_hadsex_resid=log(hadsex_resid^2)
)
err_mod = lm(
log_sq_hadsex_resid~sex + age_bracket + income + marital + race,
data=tmp,
weights=wtssall
)
summary(err_mod)
##
## Call:
## lm(formula = log_sq_hadsex_resid ~ sex + age_bracket + income +
## marital + race, data = tmp, weights = wtssall)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -16.0906 -0.6959 -0.1025 0.9256 12.2105
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -7.151676 0.104950 -68.144 < 2e-16 ***
## sexfemale -0.102261 0.022536 -4.538 5.72e-06 ***
## age_bracket(29,39] -0.195845 0.032966 -5.941 2.87e-09 ***
## age_bracket(39,49] 0.286889 0.035176 8.156 3.64e-16 ***
## age_bracket(49,59] 2.768249 0.037579 73.664 < 2e-16 ***
## income$1000 TO 2999 -0.050486 0.135918 -0.371 0.7103
## income$3000 TO 3999 -0.167652 0.148191 -1.131 0.2579
## income$4000 TO 4999 -0.218928 0.159394 -1.374 0.1696
## income$5000 TO 5999 0.167837 0.150753 1.113 0.2656
## income$6000 TO 6999 0.331158 0.153168 2.162 0.0306 *
## income$7000 TO 7999 -0.217147 0.150677 -1.441 0.1496
## income$8000 TO 9999 -0.013125 0.129201 -0.102 0.9191
## income$10000 - 14999 -0.184120 0.108398 -1.699 0.0894 .
## income$15000 - 19999 -0.172180 0.108926 -1.581 0.1140
## income$20000 - 24999 -0.242651 0.107708 -2.253 0.0243 *
## income$25000 OR MORE -0.152041 0.100709 -1.510 0.1311
## maritalwidowed 4.185582 0.089064 46.995 < 2e-16 ***
## maritaldivorced 3.927224 0.036857 106.553 < 2e-16 ***
## maritalseparated 3.539129 0.067945 52.088 < 2e-16 ***
## maritalNEVER MARRIED 4.734390 0.030870 153.367 < 2e-16 ***
## raceblack -0.082671 0.033603 -2.460 0.0139 *
## raceother -0.002466 0.039288 -0.063 0.9499
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.759 on 24089 degrees of freedom
## Multiple R-squared: 0.646, Adjusted R-squared: 0.6457
## F-statistic: 2093 on 21 and 24089 DF, p-value: < 2.2e-16
coefplot(err_mod, intercept=FALSE) +
theme_bw() +
labs(title='What factors predict log squared errors (linear model)? \nRef grps: white, married, < $1k, 18-29, male')
Lots of interactions
mod = lm(
hadsex~factor(year) + sex + age + I(age^2) + income + marital + race + sex*marital + sex*income + marital*income + marital*age + marital*I(age^2),
data=df3,
weights=wtssall
)
df3$hadsex_resid = residuals(mod)
df3$hadsex_pred = predict(mod)
df3 %>%
group_by(year, age_bracket) %>%
summarize(hadsex_resid=sum(hadsex_resid * wtssall) / sum(wtssall), n=n()) %>%
ggplot(aes(x=year, y=hadsex_resid)) +
geom_line() +
geom_smooth(method=lm, se=FALSE) +
facet_wrap(~age_bracket) +
theme_minimal()