# load data
setwd("C:/Users/monaj/Dropbox/CANB Lab/Lab Meeting Stuff/sheepskin")
data<-read.table("sub-data_3.txt", header=T, sep=",")
inflate<-read.csv("Inflation.csv")
raw<-merge(data, inflate, by="YEAR")
# load libraries
library(ggplot2)
library(car)
library(wesanderson)
library(sjPlot)
# adding additional factors
raw$Sheep<-ifelse(raw$DEGREE==0, 0,
ifelse(raw$DEGREE==1|raw$DEGREE==2, 1,
ifelse(raw$DEGREE==3, 2,
ifelse(raw$DEGREE==4, 3, NA))))
raw$HS<-factor(ifelse(raw$DEGREE==0, "No", "Yes"))
raw$HS<-relevel(raw$HS, ref="No")
raw$Bach<-factor(ifelse(raw$DEGREE<3, "No", "Yes"))
raw$Bach<-relevel(raw$Bach, ref="No")
raw$Grad<-factor(ifelse(raw$DEGREE<4, "No", "Yes"))
raw$Grad<-relevel(raw$Grad, ref="No")
raw$Survey<-ifelse(raw$YEAR>=1977 & raw$YEAR<1982, "RINCOM77",
ifelse(raw$YEAR>=1982 & raw$YEAR<1986, "RINCOM82",
ifelse(raw$YEAR>=1986 & raw$YEAR<1991, "RINCOM86",
ifelse(raw$YEAR>=1991 & raw$YEAR<1998, "RINCOM91",
ifelse(raw$YEAR>=1998 & raw$YEAR<2006, "RINCOM98",
ifelse(raw$YEAR>=2006 & raw$YEAR<2016, "RINCOM06",
ifelse(raw$YEAR>=201, "CONRINC", "RINCOME")))))))
# recoding data
raw$SEX<-car::recode(raw$SEX, "1='Male'; 2='Female'")
raw$AGE<-car::recode(raw$AGE, "98=NA; 99=NA")
raw$RACE<-car::recode(raw$RACE, "0=NA; 1='White'; 2='Black'; 3='Other'")
raw$EDUC<-car::recode(raw$EDUC, "97=NA; 98=NA; 99=NA")
raw$DEGREE<-car::recode(raw$DEGREE, "0='LT HS'; 1='HS'; 2='Jr College'; 3='Bachelor'; 4='Graduate'; 7=NA; 8=NA; 9=NA")
# average for income category taken
raw$RINCOME<-car::recode(raw$RINCOME, "0=NA;
1=500;
2=1999.5;
3=3499.5;
4=4499.5;
5=5499.5;
6=6499.5;
7=7499.5;
8=8999.5;
9=12499.5;
10=17499.5;
11=22499.5;
12=25000;
13=NA;
98=NA;
99=NA
")
raw$RINCOM77<-car::recode(raw$RINCOM77, "0=NA;
1=500;
2=1999.5;
3=3499.5;
4=4499.5;
5=5499.5;
6=6499.5;
7=7499.5;
8=8999.5;
9=11249.5;
10=13749.5;
11=16249.5;
12=18749.5;
13=21249.5;
14=23749.5;
15=37499.5;
16=50000;
17=NA;
98=NA;
99=NA
")
raw$RINCOM82<-car::recode(raw$RINCOM82, "0=NA;
1=500;
2=1999.5;
3=3499.5;
4=4499.5;
5=5499.5;
6=6499.5;
7=7499.5;
8=8999.5;
9=11249.5;
10=13749.5;
11=16249.5;
12=18749.5;
13=21249.5;
14=23749.5;
15=29999.5;
16=42499.5;
17=50000;
18=NA;
98=NA;
99=NA
")
raw$RINCOM86<-car::recode(raw$RINCOM86, "0=NA;
1=500;
2=1999.5;
3=3499.5;
4=4499.5;
5=5499.5;
6=6499.5;
7=7499.5;
8=8999.5;
9=11249.5;
10=13749.5;
11=16249.5;
12=18749.5;
13=21249.5;
14=23749.5;
15=27499.5;
16=32499.5;
17=37499.5;
18=44999.5;
19=54999.5;
20=60000;
21=NA;
98=NA;
99=NA
")
raw$RINCOM91<-car::recode(raw$RINCOM91, "0=NA;
1=500;
2=1999.5;
3=3499.5;
4=4499.5;
5=5499.5;
6=6499.5;
7=7499.5;
8=8999.5;
9=11249.5;
10=13749.5;
11=16249.5;
12=18749.5;
13=21249.5;
14=23749.5;
15=27499.5;
16=32499.5;
17=37499.5;
18=44999.5;
19=54999.5;
20=67499.5;
21=75000;
22=NA;
98=NA;
99=NA
")
raw$RINCOM98<-car::recode(raw$RINCOM98, "0=NA;
1=500;
2=1999.5;
3=3499.5;
4=4499.5;
5=5499.5;
6=6499.5;
7=7499.5;
8=8999.5;
9=11249.5;
10=13749.5;
11=16249.5;
12=18749.5;
13=21249.5;
14=23749.5;
15=27499.5;
16=32499.5;
17=37499.5;
18=44999.5;
19=54999.5;
20=67499.5;
21=82499.5;
22=99999.5;
23=110000;
24=NA;
98=NA;
99=NA
")
raw$RINCOM06<-car::recode(raw$RINCOM06, "0=NA;
1=500;
2=1999.5;
3=3499.5;
4=4499.5;
5=5499.5;
6=6499.5;
7=7499.5;
8=8999.5;
9=11249.5;
10=13749.5;
11=16249.5;
12=18749.5;
13=21249.5;
14=23749.5;
15=27499.5;
16=32499.5;
17=37499.5;
18=44999.5;
19=54999.5;
20=67499.5;
21=82499.5;
22=99999.5;
23=119999.5;
24=139999.5;
25=150000;
26=NA;
98=NA;
99=NA
")
raw$CONRINC<-car::recode(raw$CONRINC, "0=NA")
# subsetting only people who provided their degree and income information
sub<-raw[complete.cases(raw$DEGREE) & rowSums(is.na(raw[c(8:15)])) != ncol(raw[c(8:15)]), ]
# creating a different income variable
sub$Income<-NA
for (i in 1:nrow(sub)){
sub$Income[i] <-ifelse (!is.na(sub$CONRINC[i]), sub[i,c(which(names(sub) %in% sub$Survey[i]))], NA)
}
# calculating inflation
sub$Income.Adj<-sub$Income*sub$InflationInflation-adjusted income examines how much income is worth when dollar amounts are equated for how much $1 has as of June 2018 (i.e., spending power)
ggplot(sub, aes(YEAR, Income.Adj))+
geom_point()+
geom_smooth()Given that the data from 1972-1979 has significant leverage, I will be truncating the data to include only the data for 1980 onwards
df<-subset(sub, YEAR>=1980)
ggplot(df, aes(YEAR, Income.Adj))+
geom_point()+
geom_smooth()For these models, each degree was coded independent of each other and added in one by one into each model
# Education only model
model<-lm(Income.Adj~EDUC, data=df)
coef<-data.frame(Model="Edu", Variable=c("Intercept", "Education"), Coef=coef(model),confint(model))
rownames(coef)<-1:nrow(coef)
names(coef)[c((ncol(coef)-1):ncol(coef))]<-c("LowerBound", "UpperBound")
# Education + HS Degree
model.hs<-lm(Income.Adj~EDUC+HS, data=df)
coef.hs<-data.frame(Model="Edu+HS",Variable=c("Intercept", "Education", "HS Degree"), Coef=coef(model.hs),confint(model.hs))
rownames(coef.hs)<-1:nrow(coef.hs)
names(coef.hs)[c((ncol(coef.hs)-1):ncol(coef.hs))]<-c("LowerBound", "UpperBound")
# Education + HS Degree
model.hsBa<-lm(Income.Adj~EDUC+HS+Bach, data=df)
coef.hsBa<-data.frame(Model="Edu+HS+BA",Variable=c("Intercept", "Education", "HS Degree", "Bachlor Degree"),
Coef=coef(model.hsBa),confint(model.hsBa))
rownames(coef.hsBa)<-1:nrow(coef.hsBa)
names(coef.hsBa)[c((ncol(coef.hsBa)-1):ncol(coef.hsBa))]<-c("LowerBound", "UpperBound")
# Education + HS Degree + Bachelor + Graduate
model.hsBaGr<-lm(Income.Adj~EDUC+HS+Bach+Grad, data=df)
coef.hsBaGr<-data.frame(Model="Edu+HS+BA+GR",
Variable=c("Intercept", "Education", "HS Degree", "Bachlor Degree", "Graduate Degree"),
Coef=coef(model.hsBaGr),confint(model.hsBaGr))
rownames(coef.hsBaGr)<-1:nrow(coef.hsBaGr)
names(coef.hsBaGr)[c((ncol(coef.hsBaGr)-1):ncol(coef.hsBaGr))]<-c("LowerBound", "UpperBound")
# putting all the models together for graphing
models.all<-rbind(coef, coef.hs, coef.hsBa, coef.hsBaGr)
models.all$Variable<-factor(models.all$Variable, levels(models.all$Variable)[c(5,4,3,1,2)])Coeficients are graphed below. The model codings are as follows:
Edu = Years of education HS = High school degree BA = Bachelor’s degree GR = Graduate Degree
# Graphing Estimates
ggplot(models.all, aes(x=Variable, y=Coef, colour = Model))+
geom_hline(yintercept = 0, colour = "maroon", linetype="dotted", lwd=1)+
geom_pointrange(aes(ymin=LowerBound,
ymax=UpperBound),
lwd = 1, position = position_dodge(width = 1/2), shape = 16, fatten = 1.5)+
coord_flip()+
theme(
panel.background= element_rect(fill=NA), # transparent panel
plot.background = element_rect(fill=NA, colour=NA), #transparent background
panel.grid=element_blank(), # remove panel grid
axis.ticks.x=element_blank(), # remove tick marks on x-axis
axis.ticks=element_line(colour="gray70"), # change colour of tick marks
panel.border = element_rect(fill="transparent", colour="gray70"), # change panel border colour
legend.background = element_rect(fill = "transparent", colour = "transparent"), # change legend background
axis.text = element_text(color="#432620"),
legend.key = element_rect(fill = "transparent", colour = "transparent")
# legend.position="none"
)+
scale_colour_manual(values=c(wes_palette("GrandBudapest1", n=4)))Rather than have each degree coded independently, the total number of degrees that individuals had were assessed
# Education only model
model<-lm(Income.Adj~EDUC, data=df)
coef<-data.frame(Model="Edu", Variable=c("Intercept", "Education"), Coef=coef(model),confint(model))
rownames(coef)<-1:nrow(coef)
names(coef)[c((ncol(coef)-1):ncol(coef))]<-c("LowerBound", "UpperBound")
# Education + number of degrees
model.sheep<-lm(Income.Adj~EDUC+Sheep, data=df)
coef.sheep<-data.frame(Model="Edu+NumDegrees", Variable=c("Intercept", "Education", "Number of Degrees"),
Coef=coef(model.sheep),confint(model.sheep))
rownames(coef.sheep)<-1:nrow(coef.sheep)
names(coef.sheep)[c((ncol(coef.sheep)-1):ncol(coef.sheep))]<-c("LowerBound", "UpperBound")
# putting all the models together for graphing
models.all<-rbind(coef, coef.sheep)
models.all$Variable<-factor(models.all$Variable, levels(models.all$Variable)[c(3,1,2)])Coding are as follows:
Edu = Years of education NumDegree = Number of degrees (high school, bachelor’s, grad)
# Graphing Estimates
ggplot(models.all, aes(x=Variable, y=Coef, colour = Model))+
geom_hline(yintercept = 0, colour = "maroon", linetype="dotted", lwd=1)+
geom_pointrange(aes(ymin=LowerBound,
ymax=UpperBound),
lwd = 1, position = position_dodge(width = 1/2), shape = 16, fatten = 1.5)+
coord_flip()+
theme(
panel.background= element_rect(fill=NA), # transparent panel
plot.background = element_rect(fill=NA, colour=NA), #transparent background
panel.grid=element_blank(), # remove panel grid
axis.ticks.x=element_blank(), # remove tick marks on x-axis
axis.ticks=element_line(colour="gray70"), # change colour of tick marks
panel.border = element_rect(fill="transparent", colour="gray70"), # change panel border colour
legend.background = element_rect(fill = "transparent", colour = "transparent"), # change legend background
axis.text = element_text(color="#432620"),
legend.key = element_rect(fill = "transparent", colour = "transparent")
# legend.position="none"
)+
scale_colour_manual(values=c(wes_palette("GrandBudapest1", n=4)))# centering variables
df$Year.cen<-df$YEAR-mean(df$YEAR)
model.year.num<-lm(Income.Adj~EDUC+Year.cen*Sheep, data=df)Summary of model:
summary(model.year.num)##
## Call:
## lm(formula = Income.Adj ~ EDUC + Year.cen * Sheep, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -79580 -21450 -5758 14790 177839
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8819.52 1241.03 7.107 1.21e-12 ***
## EDUC 1297.59 123.28 10.526 < 2e-16 ***
## Year.cen -227.53 31.99 -7.114 1.15e-12 ***
## Sheep 13279.50 451.24 29.429 < 2e-16 ***
## Year.cen:Sheep 194.22 21.65 8.972 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31970 on 32021 degrees of freedom
## (23 observations deleted due to missingness)
## Multiple R-squared: 0.1591, Adjusted R-squared: 0.1589
## F-statistic: 1514 on 4 and 32021 DF, p-value: < 2.2e-16
plot_model(model.year.num, type = "int")Year the survey was administered negatively predicts income, adjusted for inflation!
Significant interaction of the number of degrees with year; the number of degrees determined the rate at which infleation-adjusted income drops
Going to try and break down this effect by examining which degree is most significantly affeting this
model.year.hs<-lm(Income.Adj~EDUC+Year.cen*HS, data=df)
summary(model.year.hs)##
## Call:
## lm(formula = Income.Adj ~ EDUC + Year.cen * HS, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -72576 -22182 -5931 14969 164431
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17270.68 892.09 -19.360 < 2e-16 ***
## EDUC 4709.20 77.00 61.161 < 2e-16 ***
## Year.cen -119.53 49.89 -2.396 0.01658 *
## HSYes -4106.51 694.86 -5.910 3.46e-09 ***
## Year.cen:HSYes 167.33 53.34 3.137 0.00171 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32430 on 32021 degrees of freedom
## (23 observations deleted due to missingness)
## Multiple R-squared: 0.1349, Adjusted R-squared: 0.1348
## F-statistic: 1249 on 4 and 32021 DF, p-value: < 2.2e-16
plot_model(model.year.hs, type = "int")model.year.bach<-lm(Income.Adj~EDUC+Year.cen*Bach, data=df)
summary(model.year.bach)##
## Call:
## lm(formula = Income.Adj ~ EDUC + Year.cen * Bach, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -73911 -21670 -5887 14780 165752
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3731.89 1180.31 3.162 0.00157 **
## EDUC 2576.99 94.32 27.321 < 2e-16 ***
## Year.cen -57.57 20.41 -2.821 0.00479 **
## BachYes 15736.14 604.66 26.025 < 2e-16 ***
## Year.cen:BachYes 299.45 39.00 7.678 1.67e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32070 on 32021 degrees of freedom
## (23 observations deleted due to missingness)
## Multiple R-squared: 0.154, Adjusted R-squared: 0.1539
## F-statistic: 1457 on 4 and 32021 DF, p-value: < 2.2e-16
plot_model(model.year.bach, type = "int")model.year.grad<-lm(Income.Adj~EDUC+Year.cen*Grad, data=df)
summary(model.year.grad)##
## Call:
## lm(formula = Income.Adj ~ EDUC + Year.cen * Grad, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -83068 -22009 -6064 14955 164219
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5999.244 1006.256 -5.962 2.52e-09 ***
## EDUC 3503.031 75.313 46.513 < 2e-16 ***
## Year.cen 1.453 18.371 0.079 0.93695
## GradYes 16399.025 747.186 21.948 < 2e-16 ***
## Year.cen:GradYes 224.203 59.374 3.776 0.00016 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 32180 on 32021 degrees of freedom
## (23 observations deleted due to missingness)
## Multiple R-squared: 0.1479, Adjusted R-squared: 0.1478
## F-statistic: 1389 on 4 and 32021 DF, p-value: < 2.2e-16
plot_model(model.year.grad, type = "int")