library("knitr")
## Set the working directory through knitr so that it applies to all chunks,
## then switch the echo and include chunk options on for the whole document.
opts_knit$set(
  root.dir = "C:/Users/aleaw/OneDrive/Desktop/PhD Fall 2020/TA_402/Week 13"
)
knitr::opts_chunk$set(echo = TRUE, include = TRUE)

# Read the Stata file and, in the same step, set every value from -99 to -1
# and from 101 to 1000 to NA across the whole data set.
ANES <- recode(
  read_dta("ANES2016.dta"),
  "-99:-1=NA; 101:1000=NA"
)

Common Control Variables

Recoding most of our variables:

# Derive the control variables used in the models below from the raw ANES
# items. recode() collapses item codes into labelled groups; ifelse() keeps an
# item's own numeric codes and sets out-of-range values to NA.
ANES <- within(ANES, {
  # Party identification in three groups, built from the 7-point item.
  pid3 <- recode(
    as.numeric(V161158x),
    "1:3 = 'Democrat';
     4 = 'Independent';
     5:7 = 'Republican'",
    as.factor = TRUE
  )

  # The same item kept as a numeric scale; codes below 1 become NA.
  partyscale <- ifelse(V161158x < 1, NA, V161158x)

  # Only codes 1 and 2 are kept; every other code becomes NA.
  gender <- recode(
    as.numeric(V161342),
    "1 = 'Male'; 2 = 'Female'; else=NA"
  )

  # Age in four groups. The original specification listed 45:65 and 64:120,
  # which overlap. Assuming this is car::recode(), the first matching range
  # wins, so ages 64 and 65 have always been placed in '45-65'. The last range
  # now starts at 66 so the specification states that outright and no longer
  # relies on match order; results are unchanged.
  # NOTE(review): the labels are kept as-is so existing output still matches,
  # but '65+' effectively means 66 and older, and '18-29' also accepts 17.
  agecat <- recode(
    as.numeric(V161267),
    "17:29 = '18-29';
     30:44 = '30-44';
     45:65 = '45-65';
     66:120 = '65+';
     else=NA",
    as.factor = TRUE
  )

  # Age in its original numeric coding; codes below 1 become NA.
  agecont <- ifelse(V161267 < 1, NA, V161267)

  # Race in four groups; codes 3, 4 and 6 share one label.
  race <- recode(
    as.numeric(V161310x),
    "1 = 'White';
     2 = 'Black';
     5 = 'Hisp';
     3:4 = 'Other/Multiple races';
     6='Other/Multiple races'"
  )

  # Ideology item in its original numeric coding; codes below 1 become NA.
  ideol <- ifelse(V161126 < 1, NA, V161126)

  # Education as a two-group split at code 13.
  bachelors <- recode(
    as.numeric(V161270),
    "1:12 = 'Not BA';
     13:16 = 'BA';
     else=NA"
  )

  # Education as seven labelled groups.
  educat <- recode(
    as.numeric(V161270),
    "1:8 ='Some HS';
     9 = 'HS Grad';
     10:12 = 'Some College';
     13 = 'BA';
     14 = 'Master';
     15 = 'MD,DDS, etc.';
     16 = 'PhD';
     else=NA"
  )

  # Education in its original 1-16 coding; codes above 16 become NA.
  educont <- ifelse(V161270 > 16, NA, V161270)

  # Income in its original numeric coding; codes below 1 become NA.
  income <- ifelse(V161361x < 1, NA, V161361x)
})

Control Variable Summary:

pid3: Democrats=1, Independents=2, Republicans=3, categorical
partyscale: 7 point scale from 1=strong democrat to 7=strong republican; ordinal
gender: Male and female; dichotomous
agecat: Age broken up into 4 groups; categorical
agecont: Age with coding from code book; continuous
race: Race broken down into only 4 categories; categorical
ideol: 7 point scale where 1=extremely liberal and 7=extremely conservative; ordinal
bachelors: Education variable for having 4 years of college or more; dichotomous (either 'BA' = bachelor’s degree or higher, or 'Not BA' = no bachelor’s degree)
educont: Education with a range of 1 to 16 as it was in the code book; ordinal
educat:Education with 7 categories; ordinal
income: kept coding used in code book; ordinal

Bivariate Regression: Income and Education

What is the effect of education on income?
H0: No relationship between education and income
H1: People with higher education are more likely to have higher incomes
DV = Income
IV = Education

# Bivariate OLS: income regressed on education (educont).
summary(
  lm(formula = income ~ educont, data = ANES)
)

## 
## Call:
## lm(formula = income ~ educont, data = ANES)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.2214  -5.4091   0.7786   5.9839  19.8051 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.20688    0.57048  -0.363    0.717    
## educont      1.40177    0.05005  28.008   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.379 on 4036 degrees of freedom
##   (232 observations deleted due to missingness)
## Multiple R-squared:  0.1627, Adjusted R-squared:  0.1625 
## F-statistic: 784.5 on 1 and 4036 DF,  p-value: < 2.2e-16

Intercept: the predicted outcome when the independent variable is 0. In this case, the predicted income for someone with 0 units of education is -0.21.
Education coefficient: There is a positive and significant effect of education on income.
For every additional unit of education, income increases by 1.4 units (p < .001).
R-squared: 16.27% of the variation in income is explained by education.

If someone has 12 units of education, then plug 12 into the equation

y = -0.21 + 1.4(0) = -0.21
y = -0.21 + 1.4(12) = 16.59 units of income => somewhere between 55K to 65K

# Just for fun: jittered scatterplot of income against education, with a
# fitted linear trend line (method = "lm") drawn on top.
ggplot(data = ANES, mapping = aes(x = educont, y = income)) +
  geom_point(position = "jitter") +
  geom_smooth(method = "lm", se = TRUE)

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 232 rows containing non-finite values (stat_smooth).

## Warning: Removed 232 rows containing missing values (geom_point).

Education is a significant predictor of income, but on its own it explains only about 16% of the variation. So what else predicts income?

We can add more variables!

Multivariate Regression

Additional Resources and Examples:
https://bookdown.org/ejvanholm/Textbook/multiple-regression.html
https://moderndive.netlify.app/6-2-model3.html

Predictor variables can be ANY level of measurement - nominal, ordinal, interval, or ratio.

General Syntax:
model <- lm(DV ~ IV1 + IV2 + IV3, data=df)
summary(model)

What is the effect of education on income when controlling for gender?

# Income regressed on education, controlling for gender.
summary(
  lm(formula = income ~ educont + gender, data = ANES)
)

## 
## Call:
## lm(formula = income ~ educont + gender, data = ANES)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.2660  -5.0857   0.5419   5.9143  18.7510 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.12901    0.57620  -1.959   0.0501 .  
## educont      1.40113    0.04966  28.214   <2e-16 ***
## genderMale   1.97690    0.23102   8.557   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.31 on 4014 degrees of freedom
##   (253 observations deleted due to missingness)
## Multiple R-squared:  0.1781, Adjusted R-squared:  0.1777 
## F-statistic: 434.9 on 2 and 4014 DF,  p-value: < 2.2e-16

Intercept: The predicted income for females with 0 units of education.

y = -1.13 + 1.4(0) + 1.98(0) = -1.13 units of income

Education coefficient: There is a positive and significant effect of education on income when controlling for gender. For every additional unit of education, income increases by 1.4 units (p < .001) when gender is held constant.

Male coefficient: There is a positive and significant effect of being male on one’s income when controlling for the effect of education on income (p < 0.001). On average, males make 1.98 units of income more than females when controlling for the effect of education on income.

R-squared: 17.8% of the variation in income is explained by our model.
Our Adjusted R-squared: 17.77% of the variation explained

On average, how much is a male with a master’s degree predicted to make?

y = -1.13 + 1.98(male=1) + 1.40(masters=14)
male w/ master’s degree = -1.13 + 1.98 + 19.6 = 20.45 …. 20.45 what? You must look at how the variable is coded!
20 is code for 75K to 79K
21 is code for 80K to 89K

On average, males with a master’s degree are expected to make somewhere between 75K - 90K.

What is the effect of education on income when controlling for gender and race?

# Income regressed on education, controlling for gender and race.
summary(
  lm(formula = income ~ educont + gender + race, data = ANES)
)

## 
## Call:
## lm(formula = income ~ educont + gender + race, data = ANES)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.4732  -5.1621   0.7245   5.5402  17.4887 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -3.73559    0.64909  -5.755 9.31e-09 ***
## educont                   1.33016    0.04999  26.610  < 2e-16 ***
## genderMale                1.88663    0.22873   8.248  < 2e-16 ***
## raceHisp                  2.66747    0.51173   5.213 1.96e-07 ***
## raceOther/Multiple races  3.28897    0.54782   6.004 2.10e-09 ***
## raceWhite                 4.03967    0.39589  10.204  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.213 on 3995 degrees of freedom
##   (269 observations deleted due to missingness)
## Multiple R-squared:  0.2001, Adjusted R-squared:  0.1991 
## F-statistic: 199.9 on 5 and 3995 DF,  p-value: < 2.2e-16

y = -3.74 + 1.33(educont) + 1.89(Male) + 2.67(Hisp) + 3.29(Other/Multiple) + 4.04(White)

Intercept: The comparison group in this example is black women with 0 units of education. On average, Black women with 0 units of education are expected to make -3.74 units of income.

y = -3.74 + 1.33(0) + 1.89(0) + 2.67(0) + 3.29(0) + 4.04(0) = -3.74

Education Coefficient: Education has a positive and significant effect on income. A one unit increase in education is associated with a 1.33 unit increase in income when holding gender and race constant.

Gender Coefficient: Being male has a positive and significant effect on income. On average, males make 1.89 units of income more than females when controlling for education and race.

Race Coefficient Each race has its own coefficient. It is a dummy variable, you either are or are not an option. If you are white, then you would put a 1 in white coefficient part and everything else is a 0 and drops out. If you are Hispanic, you would use the 2.67 coefficient and every other option for race would be 0. For this example, everything is relative to being Black because that is our comparison group.

R-squared: About 20% of the variation in income is explained by our model.

Note: The characteristics used for the comparison group are partly up to the researcher but can also be based on the norms within the field being researched.

What is the effect of education on income when controlling for gender, race, and age?

# Income regressed on education, controlling for gender, race and age group.
summary(
  lm(formula = income ~ educont + gender + race + agecat, data = ANES)
)

## 
## Call:
## lm(formula = income ~ educont + gender + race + agecat, data = ANES)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.9237  -4.7675   0.6363   5.3946  18.6029 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              -5.29008    0.67635  -7.822 6.65e-15 ***
## educont                   1.29537    0.04976  26.030  < 2e-16 ***
## genderMale                1.89162    0.22674   8.343  < 2e-16 ***
## raceHisp                  2.79223    0.50713   5.506 3.91e-08 ***
## raceOther/Multiple races  3.16045    0.54458   5.804 7.01e-09 ***
## raceWhite                 4.10391    0.39263  10.452  < 2e-16 ***
## agecat30-44               2.49241    0.36057   6.912 5.54e-12 ***
## agecat45-65               3.06020    0.33768   9.062  < 2e-16 ***
## agecat65+                 0.51017    0.37914   1.346    0.179    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.095 on 3937 degrees of freedom
##   (324 observations deleted due to missingness)
## Multiple R-squared:  0.2267, Adjusted R-squared:  0.2252 
## F-statistic: 144.3 on 8 and 3937 DF,  p-value: < 2.2e-16

Intercept: The comparison group in this example is Black women with 0 units of education in the youngest age category (18-29). On average, Black women aged 18-29 with 0 units of education are expected to make -5.29 units of income.
y = -5.29 + 1.29(0) + 1.89(0) + 2.79(0) + 3.16(0) + 4.10(0) + 2.49(0) + 3.06(0) + 0.51(0) = -5.29

Education Coefficient: Education has a positive and significant effect on income. A one unit increase in education is associated with a 1.30 unit increase in income when holding gender, race, and age constant.

Gender Coefficient: Being male has a positive and significant effect on income. On average, males make 1.89 units of income more than females when controlling for education, race, and age.

Race Coefficient Each race has its own coefficient. It is a dummy variable, you either are or are not an option. If you are white, then you would put a 1 in white coefficient part and use 4.10 while everything else is a 0 and drops out. If you are Hispanic, you would use the 2.79 coefficient and every other option for race would be 0. For this example, everything is relative to being Black because that is our comparison group.

Age Coefficient Also a dummy variable. You are either in an age group (1) or you are not (0).

The number of combinations has increased but here are a couple of examples:

Black males from the ages of 30-44 with 14 units of education
- -5.29 + 1.29(14) + 1.89(1) + 2.49(1) = 17.15 units of income
A white female over the age of 65 with a high school degree:
- -5.29 + 1.29(9) + 4.10(1) + .51(1) = 10.93 units of income but the age category of 65 and up is not statistically significant.

Excluding a Relevant Variable

# Income regressed on gender, party ID and the bachelor's-degree split;
# race and age are deliberately left out of this model.
summary(
  lm(formula = income ~ gender + pid3 + bachelors, data = ANES)
)

## 
## Call:
## lm(formula = income ~ gender + pid3 + bachelors, data = ANES)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.1042  -5.4903   0.5097   5.8958  16.9055 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      17.6495     0.2449  72.058  < 2e-16 ***
## genderMale        1.6989     0.2354   7.216 6.39e-13 ***
## pid3Independent  -0.6969     0.3664  -1.902   0.0572 .  
## pid3Republican    1.7558     0.2531   6.937 4.65e-12 ***
## bachelorsNot BA  -5.8580     0.2423 -24.176  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.397 on 3995 degrees of freedom
##   (270 observations deleted due to missingness)
## Multiple R-squared:  0.1572, Adjusted R-squared:  0.1563 
## F-statistic: 186.2 on 4 and 3995 DF,  p-value: < 2.2e-16

Yes the model explains ~16% of the variation in income, but what is it leaving out?
How does that influence the interpretation?

Unfortunately, based on our previous models, one’s race seems to be an important predictor of income. This example starts to show the lasting inequalities embedded in America. That being said, even with our previous model that includes education, race, age, and gender, there is still around 80% of the variation in income that hasn’t been explained by our model. Other important variables could be geographic region, industry, and hours/weeks worked. (Even with all of those variables, race is still almost always a significant predictor of income).

More graphing practice: Visualizing the distribution of income by race
Note: You would want to create better titles, axis labels, etc.

# Overlaid density curves of income, one per race group. Respondents with a
# missing race are dropped first so that NA does not show up as its own group.
ANES %>%
  filter(!is.na(race)) %>%
  # na.rm is not an aesthetic, so it does not belong in aes(); it is set on
  # the density layer below, where it removes missing incomes without a warning.
  ggplot(aes(x = income, fill = race, color = race)) +
  geom_density(adjust = 3, alpha = .1, na.rm = TRUE) +
  ggtitle("Income by Race") +
  scale_x_continuous(name = "Income") +
  scale_y_continuous(name = "") +
  theme(legend.title = element_blank())

Multivariate regression Example 2

Are people who view conservatives highly also likely to view fundamentalists highly?

# Copy the two feeling-thermometer items under readable names. They are
# converted with as.numeric() (as the other derived variables are) so they are
# plain numeric vectors rather than haven_labelled ones; the values themselves
# are unchanged, and ggplot can then pick a scale without printing a message.
ANES$fundamentalists <- as.numeric(ANES$V162101)
ANES$conservatives <- as.numeric(ANES$V162095)

# Pearson correlation between the two thermometer scores.
cor.test(ANES$fundamentalists, ANES$conservatives)

## 
##  Pearson's product-moment correlation
## 
## data:  ANES$fundamentalists and ANES$conservatives
## t = 39.058, df = 3494, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5277874 0.5739534
## sample estimates:
##       cor 
## 0.5512923

# Just for fun: jittered scatterplot of the fundamentalists score against the
# conservatives score, with a fitted linear trend line (method = "lm").
ggplot(data = ANES, mapping = aes(x = conservatives, y = fundamentalists)) +
  geom_point(position = "jitter") +
  geom_smooth(method = "lm", se = TRUE)

## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.
## Don't know how to automatically pick scale for object of type haven_labelled/vctrs_vctr/double. Defaulting to continuous.

## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 774 rows containing non-finite values (stat_smooth).

## Warning: Removed 774 rows containing missing values (geom_point).

What is the expected view of fundamentalists based on a respondent’s view of conservatives?

1 IV

# One IV: fundamentalists score regressed on conservatives score.
summary(
  lm(formula = fundamentalists ~ conservatives, data = ANES)
)

## 
## Call:
## lm(formula = fundamentalists ~ conservatives, data = ANES)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -80.879 -11.026  -1.014  13.999  68.876 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   31.12429    0.73076   42.59   <2e-16 ***
## conservatives  0.49754    0.01274   39.06   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.84 on 3494 degrees of freedom
##   (774 observations deleted due to missingness)
## Multiple R-squared:  0.3039, Adjusted R-squared:  0.3037 
## F-statistic:  1526 on 1 and 3494 DF,  p-value: < 2.2e-16

Intercept: On average, the expected value for fundamentalists for respondents that viewed conservatives as 0 of 100 on the feeling thermometer is 31.12.

Conservatives Coefficient There is a positive and significant association with the score assigned to conservatives and the score assigned to fundamentalists (p<0.001). A one unit increase in the score for conservatives is associated with a 0.50 increase in the feeling thermometer score for fundamentalists.

R-squared Our model explains 30.4% of the variation in the feeling thermometer scores of fundamentalists.

What is the expected score for fundamentalists for people who scored conservatives as 50 out of 100 on a feeling thermometer?

expected y = 31.12 + 0.50(conservatives)
intercept = 31.12
For those that gave a response of 50 out of 100 for conservatives, their predicted response for fundamentalists, on average, would be 56.12.

2 IVs

What is the expected view of fundamentalists based on one’s views of conservatives when controlling for political party?

# Two IVs: conservatives score plus three-group party ID.
summary(
  lm(formula = fundamentalists ~ conservatives + pid3, data = ANES)
)

## 
## Call:
## lm(formula = fundamentalists ~ conservatives + pid3, data = ANES)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -85.385 -11.127   1.514  12.793  72.793 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     27.20674    0.69254  39.286  < 2e-16 ***
## conservatives    0.39326    0.01208  32.551  < 2e-16 ***
## pid3Independent  6.72375    1.00684   6.678 2.81e-11 ***
## pid3Republican  20.24544    0.71818  28.190  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.82 on 3479 degrees of freedom
##   (787 observations deleted due to missingness)
## Multiple R-squared:  0.4341, Adjusted R-squared:  0.4337 
## F-statistic: 889.8 on 3 and 3479 DF,  p-value: < 2.2e-16

Intercept: The predicted value of fundamentalists for Democrats that rated their view of conservatives as a 0 on a scale of 0 to 100. On average, a Democrat that gave a 0 for their opinions of conservatives is expected to give fundamentalists a score of 27.21.

y = 27.21 + 0.39(conservatives) + 6.72(independents) + 20.25(republicans)

Conservatives coefficient: Views of conservatives are a significant predictor of views of fundamentalists when controlling for the effect of political party (p < 0.001). On average, a 1 unit increase in views of conservatives is associated with a 0.39 unit increase in views of fundamentalists when controlling for effect of political party (p<0.001).

Political party coefficient: Political party is a significant predictor of views of fundamentalists when controlling for views of conservatives. On average, an independent will rate fundamentalists as 6.72 points higher than a democrat would (p<0.001), while a republican will rate fundamentalists 20.25 points higher than democrats do (p<0.001), on average, when controlling for views of conservatives.

R-Squared: This model explains 43.4% of the variation of feeling thermometer scores for fundamentalists.

What is the predicted view of fundamentalists for republicans that gave a response of 70 out of 100 for conservatives?
Expected y = 27.21+ 0.39(70) + 20.25(1)
= 74.76
On average, republicans who scored conservatives as 70 out of 100 on the feeling thermometer are predicted to give fundamentalists a score of 74.76 out of 100.

3 IVs

What is the expected view of fundamentalists based on the respondent’s views of conservatives when controlling for political party and religion?

# Collapse V161265x into five numeric groups (1-3 -> 1, 4 -> 2, 5 -> 3,
# 6-7 -> 4, 8 -> 5, anything else -> NA), then count respondents per group.
ANES[["religion"]] <- recode(
  ANES[["V161265x"]],
  "1=1;2=1;3=1;4=2;5=3;6:7=4;8=5; else=NA"
)
table(ANES[["religion"]])

## 
##    1    2    3    4    5 
## 1421  937  674  313  877

# Replace the five numeric religion groups with text labels, then re-tabulate
# to confirm the counts carried over.
ANES[["religion"]] <- recode(
  as.factor(ANES[["religion"]]),
  "1='Protestant'; 2='Catholics'; 3='Christian'; 4='Other'; 5='Not Religious'"
)
table(ANES[["religion"]])

## 
##     Catholics     Christian Not Religious         Other    Protestant 
##           937           674           877           313          1421

# Three IVs: conservatives score, three-group party ID and religion.
summary(
  lm(formula = fundamentalists ~ conservatives + pid3 + religion, data = ANES)
)

## 
## Call:
## lm(formula = fundamentalists ~ conservatives + pid3 + religion, 
##     data = ANES)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -85.070 -11.161   1.243  12.480  74.220 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           29.53117    0.96310  30.663  < 2e-16 ***
## conservatives          0.38452    0.01292  29.763  < 2e-16 ***
## pid3Independent        6.91714    1.01828   6.793 1.29e-11 ***
## pid3Republican        20.04261    0.72326  27.711  < 2e-16 ***
## religionChristian     -2.07077    1.05706  -1.959 0.050196 .  
## religionNot Religious -3.75073    1.01002  -3.714 0.000208 ***
## religionOther         -2.80377    1.36631  -2.052 0.040238 *  
## religionProtestant    -1.57189    0.88648  -1.773 0.076286 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 18.79 on 3450 degrees of freedom
##   (812 observations deleted due to missingness)
## Multiple R-squared:  0.4381, Adjusted R-squared:  0.4369 
## F-statistic: 384.3 on 7 and 3450 DF,  p-value: < 2.2e-16

Intercept: The predicted view of fundamentalists for catholic democrats who view conservatives negatively (0 of 100). On average, a catholic democrat who gave conservatives a score of 0 on the feeling thermometer is expected to give a score of 29.53 for their view of fundamentalists.

expected y = 29.53 + 0.38(conservatives) + 6.92(Independent) + 20.04(Republican) - 2.07(Christian) - 3.75(NotReligious) - 2.80(Other) - 1.57(Protestant)

R-squared = 0.44; 44% of the variation in views of fundamentalists is explained by this model.

Conservatives coefficient:The feeling thermometer score for conservatives is a significant predictor of views of fundamentalists when controlling for political party and religion. For every 1 unit of increase in views of conservatives, views of fundamentalists are expected to increase by 0.38 (p < .001) when political party and religion is held constant.

Political party coefficient: Political party is a significant predictor of views of fundamentalists when controlling for views of conservatives and religion. On average, an independent will rate fundamentalists as 6.92 points higher than a democrat would (p<0.001), while a republican will rate fundamentalists 20.04 points higher than democrats do (p<0.001), on average, when controlling for views of conservatives and religion.

Religion coefficient: Religion is a significant predictor of views of fundamentalists when controlling for views of conservatives and political party. On average, someone who identifies as Christian will rate fundamentalists 2.07 points lower than a Catholic would, but that finding is borderline statistically significant (p=0.05). On average, people who identify as not religious will rate fundamentalists 3.75 points less than catholics do (p<0.001), on average, when controlling for views of conservatives and political party. etc. etc. do the same thing for Other religions and Protestants.

What is the predicted view of fundamentalists for someone that is a protestant and republican who reported a value of 80 for their opinion of conservatives?

y = 29.53 + 0.38(80) + 6.92(0) + 20.04(1) - 2.07(0) - 3.75(0) - 2.80(0) - 1.57(1)
y = 29.53 + 0.38(80) + 20.04(1) - 1.57(1)
The expected value for fundamentalists on a feeling thermometer, on average, is 78.4.

Options for Coding Control Variables

Two ways to code age

Age broken into 4 categories:

# Income regressed on age in four categories. The terms are written with
# ANES$ rather than data = ANES, which is why the coefficient names carry the
# "ANES$" prefix in the output.
summary(
  lm(formula = ANES$income ~ ANES$agecat)
)

## 
## Call:
## lm(formula = ANES$income ~ ANES$agecat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.6793  -5.6793   0.3207   6.8260  15.2043 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       12.7957     0.3123  40.976  < 2e-16 ***
## ANES$agecat30-44   3.3783     0.3989   8.470  < 2e-16 ***
## ANES$agecat45-65   3.8836     0.3732  10.407  < 2e-16 ***
## ANES$agecat65+     1.2587     0.4167   3.021  0.00254 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.937 on 4002 degrees of freedom
##   (264 observations deleted due to missingness)
## Multiple R-squared:  0.03433,    Adjusted R-squared:  0.0336 
## F-statistic: 47.42 on 3 and 4002 DF,  p-value: < 2.2e-16

Age as a continuous variable:

# Frequency of each age value in the continuous age variable.
table(ANES[["agecont"]])

## 
## 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 
## 28 39 54 53 43 56 55 63 70 70 60 60 69 78 86 73 81 79 69 85 66 75 51 66 67 51 
## 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 
## 46 70 83 66 53 67 59 72 59 81 78 78 88 76 94 96 89 80 69 72 75 69 88 78 94 55 
## 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 
## 54 49 53 38 45 34 41 26 29 21 24 24 24 12 13 12 14  9 12  6 27

# Income regressed on age as a single continuous term.
summary(
  lm(formula = ANES$income ~ ANES$agecont)
)

## 
## Call:
## lm(formula = ANES$income ~ ANES$agecont)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -14.755  -6.249   0.474   6.783  12.904 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  14.931209   0.379683  39.325   <2e-16 ***
## ANES$agecont  0.009150   0.007268   1.259    0.208    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.073 on 4004 degrees of freedom
##   (264 observations deleted due to missingness)
## Multiple R-squared:  0.0003957,  Adjusted R-squared:  0.000146 
## F-statistic: 1.585 on 1 and 4004 DF,  p-value: 0.2081

Age as a categorical variable (r^2=.0343) has a higher r-squared than age as a continuous variable(r^2=0.0003957). Why?
Income only increases with age for so long, so it might not actually be linear. It probably looks like an arch. This would suggest that age as a categorical variable may be more appropriate for our research question.

Four ways to code education

We’ve talked about how there are multiple ways a variable can be coded. Depending on the research topic, there may be reasons why one way is better than another way. Below, education is coded as a categorical variable, a continuous variable, and a dichotomous categorical variable; a fourth version, a collapsed six-level ordinal scale, is built further down.

# Categorical education; table() lists the categories alphabetically.
table(ANES[["educat"]])

## 
##           BA      HS Grad       Master MD,DDS, etc.          PhD Some College 
##          955          810          499           88           93         1499 
##      Some HS 
##          282

# Ordinal/continuous education, running from lower to higher educational
# achievement.
table(ANES[["educont"]])

## 
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16 
##   1   3  15  22  32  40  62 107 810 898 313 288 955 499  88  93

# Dichotomous education: a bachelor's degree (or higher) versus no bachelor's
# degree.
table(ANES[["bachelors"]])

## 
##     BA Not BA 
##   1635   2591

Which should we use?

It depends on the research question and relevant literature on the topic.
We looked at the relationship of education and income last week
Note: It is not truly continuous, but it is the most continuous of the options and was named educont because of that.

# Continuous: income regressed on the 1-16 education scale.
summary(
  lm(formula = ANES$income ~ ANES$educont)
)

## 
## Call:
## lm(formula = ANES$income ~ ANES$educont)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.2214  -5.4091   0.7786   5.9839  19.8051 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.20688    0.57048  -0.363    0.717    
## ANES$educont  1.40177    0.05005  28.008   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.379 on 4036 degrees of freedom
##   (232 observations deleted due to missingness)
## Multiple R-squared:  0.1627, Adjusted R-squared:  0.1625 
## F-statistic: 784.5 on 1 and 4036 DF,  p-value: < 2.2e-16

Education as an ordinal (almost continuous) variable explains 16.27% of the variation in income (adjusted R-squared: 16.25%).
Again, education is not truly continuous because a one unit increase does not correspond with a one year of education increase for all the options. It does indicate direction, therefore it is ordinal.

What if we want to have education grouped into fewer categories? How does that impact our model?

# Ordinal: collapse V161270 into six ordered levels (codes 1-8 -> 1, 9 -> 2,
# 10-12 -> 3, 13 -> 4, 14 -> 5, 15-16 -> 6, anything else -> NA), then count
# respondents per level.
ANES[["educatordered"]] <- recode(
  as.numeric(ANES[["V161270"]]),
  "1:8 = 1; 9 = 2; 10:12 = 3; 13 = 4; 14 = 5; 15:16 = 6; else=NA"
)
table(ANES[["educatordered"]])

## 
##    1    2    3    4    5    6 
##  282  810 1499  955  499  181

# Income regressed on the six-level ordered education scale, entered as a
# single numeric term.
summary(
  lm(formula = ANES$income ~ ANES$educatordered)
)

## 
## Call:
## lm(formula = ANES$income ~ ANES$educatordered)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -21.8203  -5.4318   0.5682   5.8739  18.6509 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.65489    0.33369   19.94   <2e-16 ***
## ANES$educatordered  2.69424    0.09598   28.07   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.376 on 4036 degrees of freedom
##   (232 observations deleted due to missingness)
## Multiple R-squared:  0.1633, Adjusted R-squared:  0.1631 
## F-statistic:   788 on 1 and 4036 DF,  p-value: < 2.2e-16

This increased our R-squared a little. This model now explains 16.3% of the variation in income. For every 1 unit increase in education, income increases by 2.69 units. Someone with 0 units of education has an expected income of 6.65 units (note that 0 falls outside the 1 to 6 range of this scale, so the intercept is an extrapolation).

# Attach text labels to the six ordered education levels, then tabulate.
ANES[["educatlabels"]] <- recode(
  as.numeric(ANES[["educatordered"]]),
  "1 ='Some HS'; 2 = 'HS Grad'; 3 = 'Some College'; 4 = 'BA'; 5 = 'Master'; 6 = 'More than a Masters'; else=NA"
)
table(ANES[["educatlabels"]])

## 
##                  BA             HS Grad              Master More than a Masters 
##                 955                 810                 499                 181 
##        Some College             Some HS 
##                1499                 282

This model, with education in ordered categories ranging from 1 to 6, explains 16.3% of the variation in income.

# Categorical: income regressed on the labelled education groups, where
# everything above a Masters degree has been grouped into one category.
summary(
  lm(formula = ANES$income ~ ANES$educatlabels)
)

## 
## Call:
## lm(formula = ANES$income ~ ANES$educatlabels)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -20.1965  -5.3749   0.6251   5.9736  18.9549 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           18.0264     0.2443  73.789  < 2e-16 ***
## ANES$educatlabelsHS Grad              -5.7486     0.3593 -16.001  < 2e-16 ***
## ANES$educatlabelsMaster                2.4468     0.4192   5.837 5.73e-09 ***
## ANES$educatlabelsMore than a Masters   3.1701     0.6107   5.191 2.19e-07 ***
## ANES$educatlabelsSome College         -3.6515     0.3118 -11.710  < 2e-16 ***
## ANES$educatlabelsSome HS              -8.9813     0.5132 -17.500  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.361 on 4032 degrees of freedom
##   (232 observations deleted due to missingness)
## Multiple R-squared:  0.1675, Adjusted R-squared:  0.1665 
## F-statistic: 162.3 on 5 and 4032 DF,  p-value: < 2.2e-16

This model, with education as a categorical variable, explains 16.75% of the variation in income. Having a bachelor’s degree is the comparison group. Grouping anything above a master’s degree slightly improved the model. Apparently doing a PhD isn’t worth it….

# Dichotomous: income regressed on the BA / Not BA split.
summary(
  lm(formula = ANES$income ~ ANES$bachelors)
)

## 
## Call:
## lm(formula = ANES$income ~ ANES$bachelors)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -18.1189  -6.1189   0.8522   5.8811  14.8522 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           19.1189     0.1912   99.99   <2e-16 ***
## ANES$bachelorsNot BA  -5.9711     0.2435  -24.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.523 on 4036 degrees of freedom
##   (232 observations deleted due to missingness)
## Multiple R-squared:  0.1297, Adjusted R-squared:  0.1295 
## F-statistic: 601.3 on 1 and 4036 DF,  p-value: < 2.2e-16

In this model, the r-squared = .1297, so about 13% of the variation in income is explained by having a bachelor’s degree or not having a bachelor’s degree.

Depending on your research question, some variations of coding of the same variable may be better to use than others.

Lab Practice

1. Opinions on scientists by political party

Test if views of scientists (V162112) depend on political party (V161019).

What is the Null Hypothesis?
What is the alternative hypothesis? What are the DV and IVs?
What happens if we use political party as a 7pt scale from Strong Democrat to Strong Republican (V161158x) instead of political party as a categorical variable? Which one is “better”?
Interpret the intercept, the coefficient and the r squared.
What is the effect of political party on views of scientists when controlling for education?
Interpret the intercept, the coefficient and the r squared.
What is the effect of political party on views of scientists when controlling for education and age?
Interpret the intercept, the coefficient and the r squared.

2. Did people who dislike liberals like Trump?

Test whether people’s opinion of Trump (V161087) depends on their views of liberals (V162097).

What is the Null Hypothesis?
What is the alternative hypothesis? What are the DV and IVs?
Interpret the intercept, the coefficient and the r squared.
Then add political party as a control variable. Interpret the intercept, the coefficient and the r squared.
Then add having a bachelors degree as a control variable.
Interpret the intercept, the coefficient and the r squared.

Week 13 Lab: Multivariate Regression