Data Cleaning

# load data
setwd("C:/Users/monaj/Dropbox/CANB Lab/Lab Meeting Stuff/sheepskin")
data<-read.table("sub-data_3.txt", header=T, sep=",")
inflate<-read.csv("Inflation.csv")
raw<-merge(data, inflate, by="YEAR")

# load libraries
library(ggplot2)
library(car)
library(wesanderson)
library(sjPlot)

# adding additional factors
raw$Sheep<-ifelse(raw$DEGREE==0, 0,
                  ifelse(raw$DEGREE==1|raw$DEGREE==2, 1, 
                         ifelse(raw$DEGREE==3, 2,
                                ifelse(raw$DEGREE==4, 3, NA))))

raw$HS<-factor(ifelse(raw$DEGREE==0, "No", "Yes"))
raw$HS<-relevel(raw$HS, ref="No")
raw$Bach<-factor(ifelse(raw$DEGREE<3, "No", "Yes"))
raw$Bach<-relevel(raw$Bach, ref="No")
raw$Grad<-factor(ifelse(raw$DEGREE<4, "No", "Yes"))
raw$Grad<-relevel(raw$Grad, ref="No")

raw$Survey<-ifelse(raw$YEAR>=1977 & raw$YEAR<1982, "RINCOM77",
                   ifelse(raw$YEAR>=1982 & raw$YEAR<1986, "RINCOM82",
                          ifelse(raw$YEAR>=1986 & raw$YEAR<1991, "RINCOM86",
                                 ifelse(raw$YEAR>=1991 & raw$YEAR<1998, "RINCOM91",
                                        ifelse(raw$YEAR>=1998 & raw$YEAR<2006, "RINCOM98",
                                               ifelse(raw$YEAR>=2006 & raw$YEAR<2016, "RINCOM06", 
                                                      ifelse(raw$YEAR>=201, "CONRINC", "RINCOME")))))))

# recoding data
raw$SEX<-car::recode(raw$SEX, "1='Male'; 2='Female'")
raw$AGE<-car::recode(raw$AGE, "98=NA; 99=NA")
raw$RACE<-car::recode(raw$RACE, "0=NA; 1='White'; 2='Black'; 3='Other'")
raw$EDUC<-car::recode(raw$EDUC, "97=NA; 98=NA; 99=NA")
raw$DEGREE<-car::recode(raw$DEGREE, "0='LT HS'; 1='HS'; 2='Jr College'; 3='Bachelor'; 4='Graduate'; 7=NA; 8=NA; 9=NA")

# average for income category taken
raw$RINCOME<-car::recode(raw$RINCOME, "0=NA;
                   1=500;
                   2=1999.5;
                   3=3499.5;
                   4=4499.5;
                   5=5499.5;
                   6=6499.5;
                   7=7499.5;
                   8=8999.5;
                   9=12499.5;
                   10=17499.5;
                   11=22499.5;
                   12=25000;
                   13=NA;
                   98=NA;
                   99=NA
                   ")
raw$RINCOM77<-car::recode(raw$RINCOM77, "0=NA;
                     1=500;
                     2=1999.5;
                     3=3499.5;
                     4=4499.5;
                     5=5499.5;
                     6=6499.5;
                     7=7499.5;
                     8=8999.5;
                     9=11249.5;
                     10=13749.5;
                     11=16249.5;
                     12=18749.5;
                     13=21249.5;
                     14=23749.5;
                     15=37499.5;
                     16=50000;
                     17=NA;
                     98=NA;
                     99=NA
                     ")

raw$RINCOM82<-car::recode(raw$RINCOM82, "0=NA;
                     1=500;
                     2=1999.5;
                     3=3499.5;
                     4=4499.5;
                     5=5499.5;
                     6=6499.5;
                     7=7499.5;
                     8=8999.5;
                     9=11249.5;
                     10=13749.5;
                     11=16249.5;
                     12=18749.5;
                     13=21249.5;
                     14=23749.5;
                     15=29999.5;
                     16=42499.5;
                     17=50000;
                     18=NA;
                     98=NA;
                     99=NA
                     ")

raw$RINCOM86<-car::recode(raw$RINCOM86, "0=NA;
                     1=500;
                     2=1999.5;
                     3=3499.5;
                     4=4499.5;
                     5=5499.5;
                     6=6499.5;
                     7=7499.5;
                     8=8999.5;
                     9=11249.5;
                     10=13749.5;
                     11=16249.5;
                     12=18749.5;
                     13=21249.5;
                     14=23749.5;
                     15=27499.5;
                     16=32499.5;
                     17=37499.5;
                     18=44999.5;
                     19=54999.5;
                     20=60000;
                     21=NA;
                     98=NA;
                     99=NA
                     ")

raw$RINCOM91<-car::recode(raw$RINCOM91, "0=NA;
                     1=500;
                     2=1999.5;
                     3=3499.5;
                     4=4499.5;
                     5=5499.5;
                     6=6499.5;
                     7=7499.5;
                     8=8999.5;
                     9=11249.5;
                     10=13749.5;
                     11=16249.5;
                     12=18749.5;
                     13=21249.5;
                     14=23749.5;
                     15=27499.5;
                     16=32499.5;
                     17=37499.5;
                     18=44999.5;
                     19=54999.5;
                     20=67499.5;
                     21=75000;
                     22=NA;
                     98=NA;
                     99=NA
                     ")

raw$RINCOM98<-car::recode(raw$RINCOM98, "0=NA;
                     1=500;
                     2=1999.5;
                     3=3499.5;
                     4=4499.5;
                     5=5499.5;
                     6=6499.5;
                     7=7499.5;
                     8=8999.5;
                     9=11249.5;
                     10=13749.5;
                     11=16249.5;
                     12=18749.5;
                     13=21249.5;
                     14=23749.5;
                     15=27499.5;
                     16=32499.5;
                     17=37499.5;
                     18=44999.5;
                     19=54999.5;
                     20=67499.5;
                     21=82499.5;
                     22=99999.5;
                     23=110000;
                     24=NA;
                     98=NA;
                     99=NA
                     ")

raw$RINCOM06<-car::recode(raw$RINCOM06, "0=NA;
                     1=500;
                     2=1999.5;
                     3=3499.5;
                     4=4499.5;
                     5=5499.5;
                     6=6499.5;
                     7=7499.5;
                     8=8999.5;
                     9=11249.5;
                     10=13749.5;
                     11=16249.5;
                     12=18749.5;
                     13=21249.5;
                     14=23749.5;
                     15=27499.5;
                     16=32499.5;
                     17=37499.5;
                     18=44999.5;
                     19=54999.5;
                     20=67499.5;
                     21=82499.5;
                     22=99999.5;
                     23=119999.5;
                     24=139999.5;
                     25=150000;
                     26=NA;
                     98=NA;
                     99=NA
                     ")

raw$CONRINC<-car::recode(raw$CONRINC, "0=NA")

# subsetting only people who provided their degree and income information

sub<-raw[complete.cases(raw$DEGREE) & rowSums(is.na(raw[c(8:15)])) != ncol(raw[c(8:15)]), ]

# creating a different income variable
sub$Income<-NA

for (i in 1:nrow(sub)){
  sub$Income[i] <-ifelse (!is.na(sub$CONRINC[i]), sub[i,c(which(names(sub) %in% sub$Survey[i]))], NA)
}

# calculating inflation
sub$Income.Adj<-sub$Income*sub$Inflation

Examining DV (Inflation-Adjusted Income)

Inflation-adjusted income examines how much income is worth when dollar amounts are equated for how much $1 has as of June 2018 (i.e., spending power)

ggplot(sub, aes(YEAR, Income.Adj))+
  geom_point()+
  geom_smooth()

Given that the data from 1972-1979 has significant leverage, I will be truncating the data to include only the data for 1980 onwards

df<-subset(sub, YEAR>=1980)

ggplot(df, aes(YEAR, Income.Adj))+
  geom_point()+
  geom_smooth()

Analysis

Separate Coding For Each Degree

For these models, each degree was coded independent of each other and added in one by one into each model

# Education only model
model<-lm(Income.Adj~EDUC, data=df)

coef<-data.frame(Model="Edu", Variable=c("Intercept", "Education"), Coef=coef(model),confint(model))
rownames(coef)<-1:nrow(coef)
names(coef)[c((ncol(coef)-1):ncol(coef))]<-c("LowerBound", "UpperBound")

# Education + HS Degree
model.hs<-lm(Income.Adj~EDUC+HS, data=df)

coef.hs<-data.frame(Model="Edu+HS",Variable=c("Intercept", "Education", "HS Degree"), Coef=coef(model.hs),confint(model.hs))
rownames(coef.hs)<-1:nrow(coef.hs)
names(coef.hs)[c((ncol(coef.hs)-1):ncol(coef.hs))]<-c("LowerBound", "UpperBound")

# Education + HS Degree 
model.hsBa<-lm(Income.Adj~EDUC+HS+Bach, data=df)

coef.hsBa<-data.frame(Model="Edu+HS+BA",Variable=c("Intercept", "Education", "HS Degree", "Bachlor Degree"),
                    Coef=coef(model.hsBa),confint(model.hsBa))
rownames(coef.hsBa)<-1:nrow(coef.hsBa)
names(coef.hsBa)[c((ncol(coef.hsBa)-1):ncol(coef.hsBa))]<-c("LowerBound", "UpperBound")

# Education + HS Degree + Bachelor + Graduate
model.hsBaGr<-lm(Income.Adj~EDUC+HS+Bach+Grad, data=df)

coef.hsBaGr<-data.frame(Model="Edu+HS+BA+GR",
                        Variable=c("Intercept", "Education", "HS Degree", "Bachlor Degree", "Graduate Degree"),
                    Coef=coef(model.hsBaGr),confint(model.hsBaGr))
rownames(coef.hsBaGr)<-1:nrow(coef.hsBaGr)
names(coef.hsBaGr)[c((ncol(coef.hsBaGr)-1):ncol(coef.hsBaGr))]<-c("LowerBound", "UpperBound")

# putting all the models together for graphing

models.all<-rbind(coef, coef.hs, coef.hsBa, coef.hsBaGr)
models.all$Variable<-factor(models.all$Variable, levels(models.all$Variable)[c(5,4,3,1,2)])

Coeficients are graphed below. The model codings are as follows:

Edu = Years of education HS = High school degree BA = Bachelor’s degree GR = Graduate Degree

# Graphing Estimates

ggplot(models.all, aes(x=Variable, y=Coef, colour = Model))+
  geom_hline(yintercept = 0, colour = "maroon", linetype="dotted", lwd=1)+
  geom_pointrange(aes(ymin=LowerBound, 
                      ymax=UpperBound), 
                     lwd = 1, position = position_dodge(width = 1/2), shape = 16, fatten = 1.5)+
  coord_flip()+
  theme(
    panel.background= element_rect(fill=NA), # transparent panel
    plot.background = element_rect(fill=NA, colour=NA), #transparent background
    panel.grid=element_blank(), # remove panel grid
    axis.ticks.x=element_blank(), # remove tick marks on x-axis
    axis.ticks=element_line(colour="gray70"), # change colour of tick marks
    panel.border = element_rect(fill="transparent", colour="gray70"), # change panel border colour
    legend.background = element_rect(fill = "transparent", colour = "transparent"), # change legend background
    axis.text = element_text(color="#432620"),
    legend.key = element_rect(fill = "transparent", colour = "transparent")
    # legend.position="none"
  )+
  scale_colour_manual(values=c(wes_palette("GrandBudapest1", n=4)))

Education + Number of Degrees

Rather than have each degree coded independently, the total number of degrees that individuals had were assessed

# Education only model
model<-lm(Income.Adj~EDUC, data=df)

coef<-data.frame(Model="Edu", Variable=c("Intercept", "Education"), Coef=coef(model),confint(model))
rownames(coef)<-1:nrow(coef)
names(coef)[c((ncol(coef)-1):ncol(coef))]<-c("LowerBound", "UpperBound")

# Education + number of degrees
model.sheep<-lm(Income.Adj~EDUC+Sheep, data=df)

coef.sheep<-data.frame(Model="Edu+NumDegrees", Variable=c("Intercept", "Education", "Number of Degrees"), 
                       Coef=coef(model.sheep),confint(model.sheep))
rownames(coef.sheep)<-1:nrow(coef.sheep)
names(coef.sheep)[c((ncol(coef.sheep)-1):ncol(coef.sheep))]<-c("LowerBound", "UpperBound")

# putting all the models together for graphing

models.all<-rbind(coef, coef.sheep)
models.all$Variable<-factor(models.all$Variable, levels(models.all$Variable)[c(3,1,2)])

Coding are as follows:

Edu = Years of education NumDegree = Number of degrees (high school, bachelor’s, grad)

# Graphing Estimates

ggplot(models.all, aes(x=Variable, y=Coef, colour = Model))+
  geom_hline(yintercept = 0, colour = "maroon", linetype="dotted", lwd=1)+
  geom_pointrange(aes(ymin=LowerBound, 
                      ymax=UpperBound), 
                     lwd = 1, position = position_dodge(width = 1/2), shape = 16, fatten = 1.5)+
  coord_flip()+
  theme(
    panel.background= element_rect(fill=NA), # transparent panel
    plot.background = element_rect(fill=NA, colour=NA), #transparent background
    panel.grid=element_blank(), # remove panel grid
    axis.ticks.x=element_blank(), # remove tick marks on x-axis
    axis.ticks=element_line(colour="gray70"), # change colour of tick marks
    panel.border = element_rect(fill="transparent", colour="gray70"), # change panel border colour
    legend.background = element_rect(fill = "transparent", colour = "transparent"), # change legend background
    axis.text = element_text(color="#432620"),
    legend.key = element_rect(fill = "transparent", colour = "transparent")
    # legend.position="none"
  )+
  scale_colour_manual(values=c(wes_palette("GrandBudapest1", n=4)))

Does the value of degrees change with time?

# centering variables
df$Year.cen<-df$YEAR-mean(df$YEAR)

model.year.num<-lm(Income.Adj~EDUC+Year.cen*Sheep, data=df)

Summary of model:

summary(model.year.num)
## 
## Call:
## lm(formula = Income.Adj ~ EDUC + Year.cen * Sheep, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -79580 -21450  -5758  14790 177839 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     8819.52    1241.03   7.107 1.21e-12 ***
## EDUC            1297.59     123.28  10.526  < 2e-16 ***
## Year.cen        -227.53      31.99  -7.114 1.15e-12 ***
## Sheep          13279.50     451.24  29.429  < 2e-16 ***
## Year.cen:Sheep   194.22      21.65   8.972  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31970 on 32021 degrees of freedom
##   (23 observations deleted due to missingness)
## Multiple R-squared:  0.1591, Adjusted R-squared:  0.1589 
## F-statistic:  1514 on 4 and 32021 DF,  p-value: < 2.2e-16
plot_model(model.year.num, type = "int")

  • Number of degrees positively predicts income (more degrees, more income)
  • Year the survey was administered negatively predicts income, adjusted for inflation!

  • Significant interaction of the number of degrees with year; the number of degrees determined the rate at which infleation-adjusted income drops

Going to try and break down this effect by examining which degree is most significantly affeting this

Does the value of a high school degree change with time?

model.year.hs<-lm(Income.Adj~EDUC+Year.cen*HS, data=df)
summary(model.year.hs)
## 
## Call:
## lm(formula = Income.Adj ~ EDUC + Year.cen * HS, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -72576 -22182  -5931  14969 164431 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    -17270.68     892.09 -19.360  < 2e-16 ***
## EDUC             4709.20      77.00  61.161  < 2e-16 ***
## Year.cen         -119.53      49.89  -2.396  0.01658 *  
## HSYes           -4106.51     694.86  -5.910 3.46e-09 ***
## Year.cen:HSYes    167.33      53.34   3.137  0.00171 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32430 on 32021 degrees of freedom
##   (23 observations deleted due to missingness)
## Multiple R-squared:  0.1349, Adjusted R-squared:  0.1348 
## F-statistic:  1249 on 4 and 32021 DF,  p-value: < 2.2e-16
plot_model(model.year.hs, type = "int")

  • The gap in income between having a HS degree vs without decreases with time

Does the value of a Bachelor’s degree change with time?

model.year.bach<-lm(Income.Adj~EDUC+Year.cen*Bach, data=df)
summary(model.year.bach)
## 
## Call:
## lm(formula = Income.Adj ~ EDUC + Year.cen * Bach, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -73911 -21670  -5887  14780 165752 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       3731.89    1180.31   3.162  0.00157 ** 
## EDUC              2576.99      94.32  27.321  < 2e-16 ***
## Year.cen           -57.57      20.41  -2.821  0.00479 ** 
## BachYes          15736.14     604.66  26.025  < 2e-16 ***
## Year.cen:BachYes   299.45      39.00   7.678 1.67e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32070 on 32021 degrees of freedom
##   (23 observations deleted due to missingness)
## Multiple R-squared:  0.154,  Adjusted R-squared:  0.1539 
## F-statistic:  1457 on 4 and 32021 DF,  p-value: < 2.2e-16
plot_model(model.year.bach, type = "int")

  • The gap in income increases with time between people with and without a Bachelor’s degree

Does the value of a graduate degree change with time?

model.year.grad<-lm(Income.Adj~EDUC+Year.cen*Grad, data=df)
summary(model.year.grad)
## 
## Call:
## lm(formula = Income.Adj ~ EDUC + Year.cen * Grad, data = df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -83068 -22009  -6064  14955 164219 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -5999.244   1006.256  -5.962 2.52e-09 ***
## EDUC              3503.031     75.313  46.513  < 2e-16 ***
## Year.cen             1.453     18.371   0.079  0.93695    
## GradYes          16399.025    747.186  21.948  < 2e-16 ***
## Year.cen:GradYes   224.203     59.374   3.776  0.00016 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32180 on 32021 degrees of freedom
##   (23 observations deleted due to missingness)
## Multiple R-squared:  0.1479, Adjusted R-squared:  0.1478 
## F-statistic:  1389 on 4 and 32021 DF,  p-value: < 2.2e-16
plot_model(model.year.grad, type = "int")