GSS Sexfreq analysis

Download the dataset from here.

raw_df = read_dta('/Users/g/Desktop/GSS7218_R1.dta')

df = raw_df

# the 2012 GSS made a dumb coding change
df$sexfreq[which(df$partners == 0 & df$year == 2012)] = 0

df = df %>%
    select(
        sexfreq,
        sex,
        year,
        income,
        rincome,
        age,
        race,
        marital,
        wtssall
    ) %>%
    mutate(
        sex=as_factor(sex),
        sexfreq=as_factor(sexfreq),
        income=as_factor(income),
        rincome=as_factor(rincome),
        race=as_factor(race),
        marital=as_factor(marital),
        age=as.numeric(age),
        age_bracket=cut(age, c(17, 29, 39, 49, 59, 89)),
        wtssall=as.numeric(wtssall),
        hadsex=ifelse(sexfreq == 'NOT AT ALL', 0, 1),
        year_anchored=year - min(year)
    ) %>%
    filter(
        !(sexfreq %in% c('NA', 'DK', 'IAP')),
        marital != 'NA',
        !is.na(sexfreq)
    )

Mice that shit

imp = mice(df, method='pmm', m=1)

## 
##  iter imp variable
##   1   1  income  rincome  age  age_bracket
##   2   1  income  rincome  age  age_bracket
##   3   1  income  rincome  age  age_bracket
##   4   1  income  rincome  age  age_bracket
##   5   1  income  rincome  age  age_bracket

## Warning: Number of logged events: 21

df2 = complete(imp)

Descriptives

yearly = df2 %>%
    group_by(year) %>%
    summarize(hadsex2=mean(hadsex), hadsex=sum(hadsex*wtssall)/sum(wtssall), n=n()) 
print(yearly)

## # A tibble: 17 x 4
##     year hadsex2 hadsex     n
##    <dbl>   <dbl>  <dbl> <int>
##  1  1989   0.781  0.814  1361
##  2  1990   0.801  0.823   552
##  3  1991   0.766  0.813  1250
##  4  1993   0.784  0.824  1428
##  5  1994   0.786  0.826  2633
##  6  1996   0.812  0.850  2527
##  7  1998   0.778  0.818  2320
##  8  2000   0.775  0.818  2263
##  9  2002   0.775  0.813  2151
## 10  2004   0.789  0.824  2109
## 11  2006   0.745  0.795  2333
## 12  2008   0.755  0.807  1683
## 13  2010   0.749  0.793  1752
## 14  2012   0.725  0.769  1601
## 15  2014   0.743  0.777  2220
## 16  2016   0.743  0.777  1711
## 17  2018   0.731  0.768  1357

df2 %>%
    group_by(year) %>%
    summarize(hadsex=sum(hadsex*wtssall)/sum(wtssall)) %>%
    ggplot(aes(x=year, y=hadsex)) +
    geom_line() +
    theme_minimal() +
    labs(title='Percentage of Americans having sex in last year, GSS, 1989-2018')

Drop the oldest age bracket

df3 = df2 %>%
    filter(age_bracket != '(59,89]')

df3 %>%
    group_by(year) %>%
    summarize(hadsex=sum(hadsex*wtssall)/sum(wtssall)) %>%
    ggplot(aes(x=year, y=hadsex)) +
    geom_line() +
    theme_minimal() +
    labs(title='Percentage of Americans under 60 having sex in last year, GSS, 1989-2018')

df3 %>%
    group_by(year, age_bracket) %>%
    summarize(hadsex=sum(hadsex*wtssall)/sum(wtssall)) %>%
    ggplot(aes(x=year, y=hadsex)) +
    geom_line() +
    geom_smooth(method=lm, se=FALSE) +
    facet_wrap(~age_bracket) +
    theme_bw() +
    labs(title='Percentage of Americans having sex in last year by age bracket, GSS, 1989-2018')

summary(
    lm(
        hadsex~year_anchored*age_bracket,
        data=df3,
        weights=wtssall
    )
)

## 
## Call:
## lm(formula = hadsex ~ year_anchored * age_bracket, data = df3, 
##     weights = wtssall)
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.16955  0.05973  0.09442  0.14259  0.51873 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)
## (Intercept)                       0.9201513  0.0146352  62.872  < 2e-16
## year_anchored                    -0.0020150  0.0004619  -4.362 1.29e-05
## age_bracket(29,39]                0.0528946  0.0208347   2.539   0.0111
## age_bracket(39,49]                0.0130940  0.0216178   0.606   0.5447
## age_bracket(49,59]               -0.0422188  0.0235444  -1.793   0.0730
## year_anchored:age_bracket(29,39]  0.0007100  0.0006626   1.071   0.2840
## year_anchored:age_bracket(39,49]  0.0010473  0.0006871   1.524   0.1274
## year_anchored:age_bracket(49,59] -0.0005230  0.0007244  -0.722   0.4703
##                                     
## (Intercept)                      ***
## year_anchored                    ***
## age_bracket(29,39]               *  
## age_bracket(39,49]                  
## age_bracket(49,59]               .  
## year_anchored:age_bracket(29,39]    
## year_anchored:age_bracket(39,49]    
## year_anchored:age_bracket(49,59]    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3295 on 24103 degrees of freedom
## Multiple R-squared:  0.02456,    Adjusted R-squared:  0.02428 
## F-statistic:  86.7 on 7 and 24103 DF,  p-value: < 2.2e-16

Residual plots

df3 %>%
    group_by(year, age_bracket, hadsex) %>%
    summarize(n=n()) %>%
    ggplot(aes(x=year, y=n, fill=factor(hadsex))) +
    geom_col(position='dodge') +
    facet_wrap(~age_bracket) +
    theme_bw() +
    labs(title='Number of respondents for each category by age by year, GSS, 1989-2018')

No interactions

mod = lm(
    hadsex~sex + rincome + marital + race,
    data=df3,
    weights=wtssall
)

df3$hadsex_resid = residuals(mod)
df3$hadsex_pred = predict(mod)

df3 %>%
    group_by(year, age_bracket) %>%
    summarize(hadsex_resid=sum(hadsex_resid * wtssall) / sum(wtssall), n=n()) %>%
    ggplot(aes(x=year, y=hadsex_resid)) +
    geom_line() +
    geom_smooth(method=lm, se=FALSE) +
    facet_wrap(~age_bracket) +
    theme_bw() +
    labs(title='Residuals from LPM, y = f(sex, rincome, marital, race) + e, GSS, 1989-2018')

summary(
    lm(
        hadsex_resid~year_anchored*age_bracket,
        data=df3,
        weights=wtssall
    )
)

## 
## Call:
## lm(formula = hadsex_resid ~ year_anchored * age_bracket, data = df3, 
##     weights = wtssall)
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.98057 -0.01330  0.08315  0.16997  0.86976 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)
## (Intercept)                       0.0502418  0.0137500   3.654 0.000259
## year_anchored                    -0.0004866  0.0004340  -1.121 0.262223
## age_bracket(29,39]               -0.0038167  0.0195745  -0.195 0.845408
## age_bracket(39,49]               -0.0598419  0.0203102  -2.946 0.003218
## age_bracket(49,59]               -0.1021274  0.0221203  -4.617 3.92e-06
## year_anchored:age_bracket(29,39]  0.0002252  0.0006225   0.362 0.717562
## year_anchored:age_bracket(39,49]  0.0007009  0.0006455   1.086 0.277570
## year_anchored:age_bracket(49,59] -0.0009999  0.0006806  -1.469 0.141826
##                                     
## (Intercept)                      ***
## year_anchored                       
## age_bracket(29,39]                  
## age_bracket(39,49]               ** 
## age_bracket(49,59]               ***
## year_anchored:age_bracket(29,39]    
## year_anchored:age_bracket(39,49]    
## year_anchored:age_bracket(49,59]    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3096 on 24103 degrees of freedom
## Multiple R-squared:  0.02885,    Adjusted R-squared:  0.02857 
## F-statistic: 102.3 on 7 and 24103 DF,  p-value: < 2.2e-16

mod = lm(
    hadsex~factor(year) + sex + age + I(age^2) + income + marital + race,
    data=df3,
    weights=wtssall
)

df3$hadsex_resid = residuals(mod)
df3$hadsex_pred = predict(mod)

df3 %>%
    group_by(year, age_bracket) %>%
    summarize(hadsex_resid=sum(hadsex_resid * wtssall) / sum(wtssall), n=n()) %>%
    ggplot(aes(x=year, y=hadsex_resid)) +
    geom_line() +
    geom_smooth(method=lm, se=FALSE) +
    facet_wrap(~age_bracket) +
    theme_minimal() +
    labs(title='Residuals from LPM, y = f(sex, rincome, marital, race, age, age^2) + e, GSS, 1989-2018')

Let’s predict the residuals

hist(log(df3$hadsex_resid^2))

Whomst is the hardest to predict?

df3 %>%
    group_by(year, age_bracket) %>%
    mutate(
        resid_sd=sum(
            wtssall * (hadsex_resid - sum(wtssall * hadsex_resid) / sum(wtssall))^2
        ) / sum(wtssall)
    ) %>%
    ggplot(aes(x=year, y=resid_sd)) +
    geom_line() +
    facet_wrap(~age_bracket)

Let’s MODEL THE LOG SQUARED ERROR

tmp = df3 %>%
    mutate(
        log_sq_hadsex_resid=log(hadsex_resid^2)
    )
err_mod = lm(
    log_sq_hadsex_resid~sex + age_bracket + income + marital + race,
    data=tmp,
    weights=wtssall
)
summary(err_mod)

## 
## Call:
## lm(formula = log_sq_hadsex_resid ~ sex + age_bracket + income + 
##     marital + race, data = tmp, weights = wtssall)
## 
## Weighted Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.0906  -0.6959  -0.1025   0.9256  12.2105 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -7.151676   0.104950 -68.144  < 2e-16 ***
## sexfemale            -0.102261   0.022536  -4.538 5.72e-06 ***
## age_bracket(29,39]   -0.195845   0.032966  -5.941 2.87e-09 ***
## age_bracket(39,49]    0.286889   0.035176   8.156 3.64e-16 ***
## age_bracket(49,59]    2.768249   0.037579  73.664  < 2e-16 ***
## income$1000 TO 2999  -0.050486   0.135918  -0.371   0.7103    
## income$3000 TO 3999  -0.167652   0.148191  -1.131   0.2579    
## income$4000 TO 4999  -0.218928   0.159394  -1.374   0.1696    
## income$5000 TO 5999   0.167837   0.150753   1.113   0.2656    
## income$6000 TO 6999   0.331158   0.153168   2.162   0.0306 *  
## income$7000 TO 7999  -0.217147   0.150677  -1.441   0.1496    
## income$8000 TO 9999  -0.013125   0.129201  -0.102   0.9191    
## income$10000 - 14999 -0.184120   0.108398  -1.699   0.0894 .  
## income$15000 - 19999 -0.172180   0.108926  -1.581   0.1140    
## income$20000 - 24999 -0.242651   0.107708  -2.253   0.0243 *  
## income$25000 OR MORE -0.152041   0.100709  -1.510   0.1311    
## maritalwidowed        4.185582   0.089064  46.995  < 2e-16 ***
## maritaldivorced       3.927224   0.036857 106.553  < 2e-16 ***
## maritalseparated      3.539129   0.067945  52.088  < 2e-16 ***
## maritalNEVER MARRIED  4.734390   0.030870 153.367  < 2e-16 ***
## raceblack            -0.082671   0.033603  -2.460   0.0139 *  
## raceother            -0.002466   0.039288  -0.063   0.9499    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.759 on 24089 degrees of freedom
## Multiple R-squared:  0.646,  Adjusted R-squared:  0.6457 
## F-statistic:  2093 on 21 and 24089 DF,  p-value: < 2.2e-16

coefplot(err_mod, intercept=FALSE) +
    theme_bw() +
    labs(title='What factors predict log squared errors (linear model)? \nRef grps: white, married, < $1k, 18-29, male')

Lots of interactions

mod = lm(
    hadsex~factor(year) + sex + age + I(age^2) + income + marital + race + sex*marital + sex*income + marital*income + marital*age + marital*I(age^2),
    data=df3,
    weights=wtssall
)

df3$hadsex_resid = residuals(mod)
df3$hadsex_pred = predict(mod)

df3 %>%
    group_by(year, age_bracket) %>%
    summarize(hadsex_resid=sum(hadsex_resid * wtssall) / sum(wtssall), n=n()) %>%
    ggplot(aes(x=year, y=hadsex_resid)) +
    geom_line() +
    geom_smooth(method=lm, se=FALSE) +
    facet_wrap(~age_bracket) +
    theme_minimal()

GSS Sexfreq analysis

George Berry

3/31/2019

Descriptives

Residual plots