Computing standardized educational attainment by race, sex, and race of partner.
setwd('~')
setwd('Documents/rstuff/assortmate/usa')
default <- read_dta("census_1960.dta")
default <- default %>% filter(income_h > 0 & income_w > 0)
default <- read_dta("census_1940.dta")
years <- c(1940, 1960, 1970, 1980, 1990, 2000, 2008, 2009, 2010, 2011, 2012, 2013)
xx <- c()
for(i in 1:12) {
alive <- paste0('census_', years[i], '.dta')
default <- read_dta(alive)
default$cohort = years[i] - default$age_w
default$year <- years[i]
default <- default %>% filter(!is.na(fw_w))
xx <- rbind(xx, default)
}
xx2 <- data.frame(xx)
xx2 <- xx %>% select(educ_years_w, educ_years_h, educ_level_h, educ_level_w, cohort, race_h, race_w, income_h, income_w, year)
###############
default <- read_dta("cohab_2007.dta")
cor.test(default$educ_years_w, default$educ_years_h)
Pearson's product-moment correlation
data: default$educ_years_w and default$educ_years_h
t = 159.42, df = 37152, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.6312551 0.6433316
sample estimates:
cor
0.6373324
cor.test(default$educ_level_w, default$educ_years_h)
Pearson's product-moment correlation
data: default$educ_level_w and default$educ_years_h
t = 151.4, df = 37263, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.6108134 0.6233863
sample estimates:
cor
0.6171393
years <- c(2007, 2008, 2009, 2010, 2011, 2012, 2013)
dali <- rep(0, 7)
fly <- data.frame(years, dali)
fly$n = 0
for(i in 1:7) {
alive <- paste0('cohab_', years[i], '.dta')
default <- read_dta(alive)
default$cohort = years[i] - default$age_w
default$year <- years[i]
default <- default %>% select(educ_years_w, educ_years_h, educ_level_h, educ_level_w, cohort, race_h, race_w, income_h, income_w, year)
xx2 <- rbind(xx2, default)
}
###############
years <- 1962:2013
years <- years[-2]
for(i in 1:length(years)) {
alive <- paste0('CPS_', years[i], '.dta')
default <- read_dta(alive)
default$cohort = years[i] - default$age_w
default$year <- years[i]
default <- default %>% select(educ_years_w, educ_years_h, educ_level_h, educ_level_w, cohort, race_h, race_w, income_h, income_w, year)
xx2 <- rbind(xx2, default)
}
xx2$eduh <- getpc(xx2 %>% select(educ_level_h, educ_years_h), dofa=T, normalizeit=T, fillmissing=F)
xx2$eduw <- getpc(xx2 %>% select(educ_level_w, educ_years_w), dofa=T, normalizeit=T, fillmissing=F)
xx2$manrace <- NA
xx2$manrace[xx2$race_h==1] <- 'White'
xx2$manrace[xx2$race_h==2] <- 'Black'
xx2$manrace[xx2$race_h==3] <- 'Amerindian'
xx2$manrace[xx2$race_h==4] <- 'Chinese'
xx2$manrace[xx2$race_h==5] <- 'Japanese'
xx2$manrace[xx2$race_h==6] <- 'Other Asian'
xx2$manrace[is.na(xx2$manrace)] <- 'Unknown'
xx2$womanrace <- NA
xx2$womanrace[xx2$race_w==1] <- 'White'
xx2$womanrace[xx2$race_w==2] <- 'Black'
xx2$womanrace[xx2$race_w==3] <- 'Amerindian'
xx2$womanrace[xx2$race_w==4] <- 'Chinese'
xx2$womanrace[xx2$race_w==5] <- 'Japanese'
xx2$womanrace[xx2$race_w==6] <- 'Other Asian'
xx2$womanrace[is.na(xx2$womanrace)] <- 'Unknown'
xx3 <- na.omit(xx2 %>% select(womanrace, manrace, eduh, eduw, year, cohort))
lr <- lm(data = xx3, eduh ~ rcs(cohort, 15) + rcs(year, 15))
summary(lr)
Call:
lm(formula = eduh ~ rcs(cohort, 15) + rcs(year, 15), data = xx3)
Residuals:
Min 1Q Median 3Q Max
-3.6364 -0.5786 -0.0494 0.8793 3.2581
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.012e+01 2.099e-01 -191.171 < 2e-16 ***
rcs(cohort, 15)cohort 1.539e-02 1.398e-04 110.055 < 2e-16 ***
rcs(cohort, 15)cohort' 5.710e-02 2.045e-03 27.920 < 2e-16 ***
rcs(cohort, 15)cohort'' -4.597e-01 2.131e-02 -21.574 < 2e-16 ***
rcs(cohort, 15)cohort''' 1.324e+00 6.849e-02 19.334 < 2e-16 ***
rcs(cohort, 15)cohort'''' -2.046e+00 1.459e-01 -14.022 < 2e-16 ***
rcs(cohort, 15)cohort''''' 1.459e+00 2.853e-01 5.114 3.15e-07 ***
rcs(cohort, 15)cohort'''''' -2.378e+00 5.627e-01 -4.226 2.38e-05 ***
rcs(cohort, 15)cohort''''''' 2.448e+00 8.768e-01 2.792 0.005240 **
rcs(cohort, 15)cohort'''''''' 4.110e+00 1.363e+00 3.015 0.002573 **
rcs(cohort, 15)cohort''''''''' -4.936e+00 1.383e+00 -3.570 0.000357 ***
rcs(cohort, 15)cohort'''''''''' -4.640e-01 9.586e-01 -0.484 0.628364
rcs(cohort, 15)cohort''''''''''' 2.007e+00 7.836e-01 2.562 0.010421 *
rcs(cohort, 15)cohort'''''''''''' -1.753e+00 5.175e-01 -3.388 0.000705 ***
rcs(cohort, 15)cohort''''''''''''' 4.910e-02 2.400e-01 0.205 0.837862
rcs(year, 15)year 4.998e-03 1.149e-04 43.508 < 2e-16 ***
rcs(year, 15)year' 4.912e-02 1.301e-03 37.750 < 2e-16 ***
rcs(year, 15)year'' -1.233e-01 4.201e-03 -29.357 < 2e-16 ***
rcs(year, 15)year''' 1.156e-02 6.502e-03 1.778 0.075341 .
rcs(year, 15)year'''' 3.837e-01 1.070e-02 35.859 < 2e-16 ***
rcs(year, 15)year''''' -1.109e+00 3.642e-02 -30.454 < 2e-16 ***
rcs(year, 15)year'''''' 2.303e+00 5.259e-01 4.379 1.19e-05 ***
rcs(year, 15)year''''''' 1.144e+01 2.156e+00 5.309 1.10e-07 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9357 on 11982690 degrees of freedom
Multiple R-squared: 0.1165, Adjusted R-squared: 0.1165
F-statistic: 7.18e+04 on 22 and 11982690 DF, p-value: < 2.2e-16
xx3$educorh <- normalise(lr$residuals)
lr <- lm(data = xx3, eduw ~ rcs(cohort, 15) + rcs(year, 15))
summary(lr)
Call:
lm(formula = eduw ~ rcs(cohort, 15) + rcs(year, 15), data = xx3)
Residuals:
Min 1Q Median 3Q Max
-3.9953 -0.5530 -0.0546 0.7690 3.4835
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.406e+01 2.050e-01 -214.968 < 2e-16 ***
rcs(cohort, 15)cohort 1.694e-02 1.366e-04 124.044 < 2e-16 ***
rcs(cohort, 15)cohort' -1.527e-02 1.997e-03 -7.642 2.13e-14 ***
rcs(cohort, 15)cohort'' 1.389e-02 2.081e-02 0.668 0.504382
rcs(cohort, 15)cohort''' 5.221e-01 6.689e-02 7.805 5.94e-15 ***
rcs(cohort, 15)cohort'''' -1.304e+00 1.425e-01 -9.147 < 2e-16 ***
rcs(cohort, 15)cohort''''' 7.386e-01 2.787e-01 2.650 0.008048 **
rcs(cohort, 15)cohort'''''' -9.179e-01 5.496e-01 -1.670 0.094907 .
rcs(cohort, 15)cohort''''''' -9.622e-01 8.564e-01 -1.123 0.261234
rcs(cohort, 15)cohort'''''''' 1.040e+01 1.332e+00 7.812 5.64e-15 ***
rcs(cohort, 15)cohort''''''''' -9.881e+00 1.351e+00 -7.316 2.55e-13 ***
rcs(cohort, 15)cohort'''''''''' -9.352e-02 9.363e-01 -0.100 0.920439
rcs(cohort, 15)cohort''''''''''' 2.897e+00 7.654e-01 3.785 0.000154 ***
rcs(cohort, 15)cohort'''''''''''' -1.707e+00 5.055e-01 -3.376 0.000734 ***
rcs(cohort, 15)cohort''''''''''''' -8.381e-01 2.344e-01 -3.576 0.000349 ***
rcs(year, 15)year 5.499e-03 1.122e-04 49.011 < 2e-16 ***
rcs(year, 15)year' 2.750e-02 1.271e-03 21.633 < 2e-16 ***
rcs(year, 15)year'' -1.838e-02 4.103e-03 -4.479 7.48e-06 ***
rcs(year, 15)year''' -1.644e-01 6.351e-03 -25.893 < 2e-16 ***
rcs(year, 15)year'''' 4.938e-01 1.045e-02 47.243 < 2e-16 ***
rcs(year, 15)year''''' -9.988e-01 3.557e-02 -28.079 < 2e-16 ***
rcs(year, 15)year'''''' 1.012e+00 5.137e-01 1.970 0.048852 *
rcs(year, 15)year''''''' 1.305e+01 2.106e+00 6.197 5.75e-10 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.914 on 11982690 degrees of freedom
Multiple R-squared: 0.1489, Adjusted R-squared: 0.1489
F-statistic: 9.527e+04 on 22 and 11982690 DF, p-value: < 2.2e-16
xx3$educorw <- normalise(lr$residuals)
lr <- lm(data = xx3 %>% filter(manrace=='White'), educorh ~ as.factor(womanrace))
summary(lr)
Call:
lm(formula = educorh ~ as.factor(womanrace), data = xx3 %>% filter(manrace ==
"White"))
Residuals:
Min 1Q Median 3Q Max
-4.0624 -0.6466 -0.0882 0.8849 3.4081
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.034728 0.004526 -7.673 1.68e-14 ***
as.factor(womanrace)Black 0.183004 0.008648 21.161 < 2e-16 ***
as.factor(womanrace)Chinese 0.732115 0.008692 84.232 < 2e-16 ***
as.factor(womanrace)Japanese 0.526739 0.009023 58.374 < 2e-16 ***
as.factor(womanrace)Other Asian 0.368855 0.006265 58.875 < 2e-16 ***
as.factor(womanrace)Unknown 0.219408 0.005063 43.334 < 2e-16 ***
as.factor(womanrace)White 0.108539 0.004537 23.924 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.9529 on 9810947 degrees of freedom
Multiple R-squared: 0.001667, Adjusted R-squared: 0.001666
F-statistic: 2730 on 6 and 9810947 DF, p-value: < 2.2e-16
intermilan <- data.frame(beta=rep(0, 1), se=rep(0, 1))
intermilan$raceman <- NA
intermilan$racewoman <- NA
for(race in unique(xx3$manrace)) {
lr <- lm(data = xx3 %>% filter(manrace==race), educorh ~ as.factor(womanrace))
summary(lr)
intermilan <- rbind(intermilan, data.frame(beta = lr$coefficients, se = summary(lr)$coefficients[, "Std. Error"], raceman = race, racewoman = names(summary(lr)$coefficients[, 1])))
}
intermilan$racewoman <- gsub("as.factor\\(womanrace\\)", "", intermilan$racewoman)
intermilan$racewoman[intermilan$racewoman=='(Intercept)'] <- 'Amerindian'
intermilan$Upper_CI <- intermilan$beta + intermilan$se*1.96
intermilan$Lower_CI <- intermilan$beta - intermilan$se*1.96
intermilan <- intermilan[-1, ]
im2 <- intermilan %>%
group_by(raceman, racewoman)
im2$race <- paste0("Husband: ", im2$raceman, ', Wife: ', im2$racewoman)
im2 <- im2 %>% filter(!(raceman=='Unknown') & !(racewoman=='Unknown'))
p <- ggplot(im2, aes(x = beta, y = race)) +
geom_bar(stat = "identity", position = position_dodge(), color = "black") +
geom_errorbarh(aes(xmin = Lower_CI, xmax = Upper_CI), width = 0.2, position = position_dodge(.9)) +
scale_fill_brewer(palette = "Grey") +
labs(title = "Standardized educational attainment of husband by race of wife",
subtitle = 'Sample of 11,982,713 Americans sampled between 1940 and 2013, some are cohabiters',
x = "Educational Attainment",
y = "Race") +
theme_bw() +
theme(plot.title = element_text(size = 16),
plot.subtitle = element_text(size = 12),
axis.title = element_text(size = 14),
axis.text = element_text(size = 11),
legend.position = "none")
Warning: Ignoring unknown parameters: `width`Warning: Unknown palette: "Grey"
p
##########################################
intermilan <- data.frame(beta=rep(0, 1), se=rep(0, 1))
intermilan$raceman <- NA
intermilan$racewoman <- NA
for(race in unique(xx3$womanrace)) {
lr <- lm(data = xx3 %>% filter(womanrace==race), educorw ~ as.factor(manrace))
summary(lr)
intermilan <- rbind(intermilan, data.frame(beta = lr$coefficients, se = summary(lr)$coefficients[, "Std. Error"], racewoman = race, raceman = names(summary(lr)$coefficients[, 1])))
}
intermilan
intermilan$raceman <- gsub("as.factor\\(manrace\\)", "", intermilan$raceman)
intermilan$raceman[intermilan$raceman=='(Intercept)'] <- 'Amerindian'
intermilan$Upper_CI <- intermilan$beta + intermilan$se*1.96
intermilan$Lower_CI <- intermilan$beta - intermilan$se*1.96
intermilan <- intermilan[-1, ]
intermilan
im2 <- intermilan %>%
group_by(racewoman, raceman)
im2
im2$race <- paste0('Wife: ', im2$racewoman, ", Husband: ", im2$raceman)
im2 <- im2 %>% filter(!(raceman=='Unknown') & !(racewoman=='Unknown'))
im2
p <- ggplot(im2, aes(x = beta, y = race)) +
geom_bar(stat = "identity", position = position_dodge(), color = "black") +
geom_errorbarh(aes(xmin = Lower_CI, xmax = Upper_CI), width = 0.2, position = position_dodge(.9)) +
scale_fill_brewer(palette = "Grey") +
labs(title = "Standardized educational attainment of wife by race of husband",
subtitle = 'Sample of 11,982,713 Americans sampled between 1940 and 2013, some are cohabiters',
x = "Educational Attainment",
y = "Race") +
theme_bw() +
theme(plot.title = element_text(size = 16),
plot.subtitle = element_text(size = 12),
axis.title = element_text(size = 14),
axis.text = element_text(size = 11),
legend.position = "none")
Warning: Ignoring unknown parameters: `width`Warning: Unknown palette: "Grey"
p