Loading Packages

library(readr)
library(car)
library(broom)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(haven)
library(tidyr)

Using IPUMS data

ipums <- read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")
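
Before recoding, it is worth a quick look at the raw file. This is an optional sanity check, not part of the assignment itself:

# confirm the file loaded and inspect the coded variables recoded below
dim(ipums)
head(ipums[, c("age", "sex", "incwage", "educd", "race", "hispan", "empstat")])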

1 & 2) Constructing Outcome and Predictor Variables

newpums <- ipums %>%
  filter(labforce == 2, age >= 18, incwage > 0,
         educd != 999, race != 9, empstat > 0) %>% # filtering the data
  
  mutate(mywage = ifelse(incwage %in% c(999998, 999999), NA, incwage), # outcome variable
         sexrecode = ifelse(sex == 1, "male", "female")) %>%
  mutate(sex_recode = ifelse(sexrecode == "male", 1, 0)) %>% # predictor variable 2
                                                             # age is predictor variable 1
  
  mutate(edurec = case_when(educd %in% 0:61 ~ "nohs",
                            educd %in% 62:64 ~ "hs",
                            educd %in% 65:100 ~ "somecoll",
                            educd %in% 101:116 ~ "collgrad",
                            educd == 999 ~ "missing")) %>%
  mutate(edu_rec = ifelse(edurec == "collgrad", 1, 0)) %>% # predictor variable 3
  
  mutate(race_eth = case_when(hispan %in% 1:4 & race %in% 1:9 ~ "hispanic",
                              hispan == 0 & race == 1 ~ "0nh_white",
                              hispan == 0 & race == 2 ~ "nh_black",
                              hispan == 0 & race %in% c(3, 7, 8, 9) ~ "nh_other",
                              hispan == 0 & race %in% 4:6 ~ "nh_asian",
                              hispan == 9 ~ "missing")) %>%
  mutate(race.eth = ifelse(race_eth == "0nh_white", 1, 0)) %>% # predictor variable 4
  
  mutate(emp.stat = case_when(empstat == 1 ~ "Employed",
                              empstat == 2 ~ "Unemployed",
                              empstat == 3 ~ "Not in labor force")) %>%
  mutate(emp_stat = ifelse(emp.stat == "Employed", 1, 0)) # predictor variable 5
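
As a quick check that the recodes behave as intended (a sketch assuming the pipeline above has run), we can cross-tabulate each dummy against its source categories:

# each dummy should line up with exactly one source category,
# and mywage should contain no topcoded values
table(newpums$edurec, newpums$edu_rec, useNA = "ifany")
table(newpums$race_eth, newpums$race.eth, useNA = "ifany")
summary(newpums$mywage)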

3) Nested OLS Regression Models

Here we will fit three models.

The first model includes age (entered with a quadratic term), sex, and education. The second model adds race/ethnicity, and the third adds employment status. All predictors are standardized with scale().

mod1 <- lm(log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) + scale(edu_rec), data = newpums)
mod2 <- lm(log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) + scale(edu_rec) + scale(race.eth), data = newpums)
mod3 <- lm(log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) + scale(edu_rec) + scale(race.eth) + scale(emp_stat), data = newpums)
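
Since broom is loaded, one optional way (a convenience sketch, not required here) to view the three fits side by side is to stack their tidied coefficient tables:

# tidy() turns each lm fit into a data frame of estimates and SEs
bind_rows(tidy(mod1) %>% mutate(model = "mod1"),
          tidy(mod2) %>% mutate(model = "mod2"),
          tidy(mod3) %>% mutate(model = "mod3"))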

Getting summary estimates

summary(mod1)
## 
## Call:
## lm(formula = log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) + 
##     scale(edu_rec), data = newpums)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.9589 -0.4174  0.1469  0.5912  4.5380 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       10.299786   0.002648 3890.03   <2e-16 ***
## scale(age)         2.304498   0.015552  148.18   <2e-16 ***
## scale(I(age^2))   -2.056210   0.015538 -132.34   <2e-16 ***
## scale(sex_recode)  0.206483   0.002649   77.94   <2e-16 ***
## scale(edu_rec)     0.337050   0.002670  126.22   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9624 on 132126 degrees of freedom
## Multiple R-squared:  0.2962, Adjusted R-squared:  0.2962 
## F-statistic: 1.39e+04 on 4 and 132126 DF,  p-value: < 2.2e-16
summary(mod2)
## 
## Call:
## lm(formula = log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) + 
##     scale(edu_rec) + scale(race.eth), data = newpums)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0001 -0.4105  0.1433  0.5903  4.5346 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       10.299786   0.002640 3901.17   <2e-16 ***
## scale(age)         2.310552   0.015509  148.98   <2e-16 ***
## scale(I(age^2))   -2.071752   0.015504 -133.63   <2e-16 ***
## scale(sex_recode)  0.204928   0.002642   77.55   <2e-16 ***
## scale(edu_rec)     0.329827   0.002676  123.28   <2e-16 ***
## scale(race.eth)    0.073750   0.002677   27.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9597 on 132125 degrees of freedom
## Multiple R-squared:  0.3003, Adjusted R-squared:  0.3002 
## F-statistic: 1.134e+04 on 5 and 132125 DF,  p-value: < 2.2e-16
summary(mod3)
## 
## Call:
## lm(formula = log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) + 
##     scale(edu_rec) + scale(race.eth) + scale(emp_stat), data = newpums)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0149 -0.4143  0.1293  0.5749  4.4712 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       10.299786   0.002586 3982.77   <2e-16 ***
## scale(age)         2.249140   0.015214  147.84   <2e-16 ***
## scale(I(age^2))   -2.023094   0.015200 -133.10   <2e-16 ***
## scale(sex_recode)  0.207318   0.002588   80.09   <2e-16 ***
## scale(edu_rec)     0.321156   0.002623  122.43   <2e-16 ***
## scale(race.eth)    0.067901   0.002624   25.88   <2e-16 ***
## scale(emp_stat)    0.194389   0.002601   74.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.94 on 132124 degrees of freedom
## Multiple R-squared:  0.3286, Adjusted R-squared:  0.3286 
## F-statistic: 1.078e+04 on 6 and 132124 DF,  p-value: < 2.2e-16

We see the estimates of the parameters, their associated standard errors, and the t-statistics for each, along with the calculated p-value of the test.

Each t statistic is far from 0, with a very small p-value, which indicates an association between that predictor and log wages.

The Multiple R-squared value for model 3 (0.3286) shows that the predictor variables explain nearly 33% of the variability in the outcome variable, more than in model 1 and model 2.

All the predictors are significant.

Model 3 shows that income rises with age, but the negative coefficient on the squared age term means the increase slows and eventually reverses: the age-earnings profile is concave.
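
To see where the profile turns over, we can refit model 3 with unscaled age terms (a supplementary sketch; mod3_raw is not one of the models above) and solve for the vertex of the quadratic, -b1 / (2 * b2):

# refit with raw age so the peak can be read from the coefficients
mod3_raw <- lm(log(mywage) ~ age + I(age^2) + sex_recode + edu_rec +
                 race.eth + emp_stat, data = newpums)
b <- coef(mod3_raw)
-b["age"] / (2 * b["I(age^2)"]) # age at which predicted log wage peaks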

4) Comparing Models using F-test and the AIC

F-test

anova(mod1, mod2, mod3)
## Analysis of Variance Table
## 
## Model 1: log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) + 
##     scale(edu_rec)
## Model 2: log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) + 
##     scale(edu_rec) + scale(race.eth)
## Model 3: log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) + 
##     scale(edu_rec) + scale(race.eth) + scale(emp_stat)
##   Res.Df    RSS Df Sum of Sq       F    Pr(>F)    
## 1 132126 122389                                   
## 2 132125 121691  1     698.9  790.88 < 2.2e-16 ***
## 3 132124 116754  1    4936.7 5586.54 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We can see the output for all three models together. Model 3 has the lowest residual sum of squares. Each row of the table tests whether the term added at that step (1 degree of freedom each) significantly reduces the residual sum of squares relative to the previous model, and the tiny p-values show that each addition significantly improves the fit.

We can also conclude that adding employment status to the model makes model 3 an improvement over the other two.
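
As a check on the last row of the table, the F statistic can be computed by hand: the drop in residual sum of squares per added degree of freedom, divided by the full model's residual mean square.

rss2 <- sum(residuals(mod2)^2)
rss3 <- sum(residuals(mod3)^2)
((rss2 - rss3) / 1) / (rss3 / df.residual(mod3)) # should match 5586.54 above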

How different are the models from one another?

AICs <- AIC(mod1, mod2, mod3)
AICs$diff <- AICs$AIC - AICs$AIC[1]
AICs
##      df      AIC       diff
## mod1  6 364864.2     0.0000
## mod2  7 364109.5  -754.6644
## mod3  8 358639.6 -6224.6121

A difference in AIC of more than 10 between two models indicates essentially no support for the model with the higher AIC. Here model 3's AIC is more than 6,000 points below model 1's, so it is clearly preferred.

The difference value of -6224.61 shows that model 3 fits best; keep in mind, however, that AIC is a relative measure, so it only tells us model 3 is the best of the models considered here, not that it fits well in an absolute sense.
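
One optional way to express this (not part of the assignment) is with Akaike weights, which convert the AIC differences into relative support for each model among the three compared:

delta <- AICs$AIC - min(AICs$AIC)
round(exp(-delta / 2) / sum(exp(-delta / 2)), 4) # model 3 gets essentially all the weight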

5) Evaluating the assumptions of the model

1. Plots

plot(mod3, which=1)

plot(mod3, which=2)

The first plot shows residuals against fitted values, which is used to look for non-linearity and non-constant variance. The Q-Q plot compares the quantiles of the model's residuals to the quantiles of a normal distribution; the points fall away from the reference line in the tails, suggesting the residuals are not normally distributed.
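
A histogram of the studentized residuals with a normal curve overlaid gives a complementary view (an optional sketch):

hist(rstudent(mod3), breaks = 100, freq = FALSE,
     xlab = "rstudent(mod3)", main = "Studentized residuals vs. standard normal")
curve(dnorm(x), add = TRUE, lwd = 2, col = "red")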

2. Normality check through Kolmogorov-Smirnov test

ks.test(rstudent(mod3), y=pnorm)
## Warning in ks.test(rstudent(mod3), y = pnorm): ties should not be present
## for the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  rstudent(mod3)
## D = 0.084737, p-value < 2.2e-16
## alternative hypothesis: two-sided

The K-S statistic (D = 0.085) says that the maximum difference between the empirical distribution of the studentized residuals and the standard normal distribution is about 8.5 percentage points, and the tiny p-value leads us to reject the hypothesis that the residuals are normally distributed.
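
The ties warning occurs because some studentized residuals are numerically identical. A common workaround (an assumption on my part, not in the original analysis) is to add a negligible jitter before re-running the test:

set.seed(1)
ks.test(jitter(rstudent(mod3), amount = 1e-8), "pnorm")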

3. Heteroskedasticity check through the Breusch-Pagan test

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(mod3)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod3
## BP = 2351.1, df = 6, p-value < 2.2e-16

The tiny p-value rejects the null hypothesis of constant error variance, so the errors are heteroskedastic.

4. Correcting heteroskedasticity

coeftest(mod3, vcov. = hccm(mod3, type = "hc0")) # hc0 gives White's standard errors
## 
## t test of coefficients:
## 
##                     Estimate Std. Error  t value  Pr(>|t|)    
## (Intercept)       10.2997855  0.0025860 3982.879 < 2.2e-16 ***
## scale(age)         2.2491402  0.0189922  118.424 < 2.2e-16 ***
## scale(I(age^2))   -2.0230944  0.0194459 -104.037 < 2.2e-16 ***
## scale(sex_recode)  0.2073177  0.0025936   79.933 < 2.2e-16 ***
## scale(edu_rec)     0.3211558  0.0026434  121.495 < 2.2e-16 ***
## scale(race.eth)    0.0679009  0.0026136   25.980 < 2.2e-16 ***
## scale(emp_stat)    0.1943891  0.0036586   53.132 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Comparing this output to the output from:

summary(mod3)
## 
## Call:
## lm(formula = log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) + 
##     scale(edu_rec) + scale(race.eth) + scale(emp_stat), data = newpums)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.0149 -0.4143  0.1293  0.5749  4.4712 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       10.299786   0.002586 3982.77   <2e-16 ***
## scale(age)         2.249140   0.015214  147.84   <2e-16 ***
## scale(I(age^2))   -2.023094   0.015200 -133.10   <2e-16 ***
## scale(sex_recode)  0.207318   0.002588   80.09   <2e-16 ***
## scale(edu_rec)     0.321156   0.002623  122.43   <2e-16 ***
## scale(race.eth)    0.067901   0.002624   25.88   <2e-16 ***
## scale(emp_stat)    0.194389   0.002601   74.74   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.94 on 132124 degrees of freedom
## Multiple R-squared:  0.3286, Adjusted R-squared:  0.3286 
## F-statistic: 1.078e+04 on 6 and 132124 DF,  p-value: < 2.2e-16

The coefficient estimates are identical by construction; the heteroskedasticity-consistent standard errors are somewhat larger for the age terms and employment status, but the substantive conclusions do not change.
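
A compact way to see this is the ratio of robust to classical standard errors for each coefficient (a small comparison sketch; values near 1 mean the correction changes little):

se_ols    <- sqrt(diag(vcov(mod3)))
se_robust <- sqrt(diag(hccm(mod3, type = "hc0")))
round(se_robust / se_ols, 3)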

6) Results and Discussion

Summary Table

library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2015). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2. http://CRAN.R-project.org/package=stargazer
stargazer(mod1, mod2, mod3, type = "html", style = "demography", ci = T)
                      log(mywage)
                      Model 1               Model 2               Model 3
scale(age)            2.304***              2.311***              2.249***
                      (2.274, 2.335)        (2.280, 2.341)        (2.219, 2.279)
scale(I(age^2))       -2.056***             -2.072***             -2.023***
                      (-2.087, -2.026)      (-2.102, -2.041)      (-2.053, -1.993)
scale(sex_recode)     0.206***              0.205***              0.207***
                      (0.201, 0.212)        (0.200, 0.210)        (0.202, 0.212)
scale(edu_rec)        0.337***              0.330***              0.321***
                      (0.332, 0.342)        (0.325, 0.335)        (0.316, 0.326)
scale(race.eth)                             0.074***              0.068***
                                            (0.069, 0.079)        (0.063, 0.073)
scale(emp_stat)                                                   0.194***
                                                                  (0.189, 0.199)
Constant              10.300***             10.300***             10.300***
                      (10.295, 10.305)      (10.295, 10.305)      (10.295, 10.305)
N                     132,131               132,131               132,131
R2                    0.296                 0.300                 0.329
Adjusted R2           0.296                 0.300                 0.329
Residual Std. Error   0.962 (df = 132126)   0.960 (df = 132125)   0.940 (df = 132124)
F Statistic           13,904.010***         11,338.760***         10,779.510***
                      (df = 4; 132126)      (df = 5; 132125)      (df = 6; 132124)
* p < .05; ** p < .01; *** p < .001

The adjusted R-squared value from the table shows that the predictor variables explain around 33 percent of the variability in income, so the model accounts for a meaningful share of the variation in the outcome. Income increases with age, although the negative squared term means the increase levels off at older ages. Sex, education, race/ethnicity, and employment status are also significantly associated with income. Note that the education and race/ethnicity coefficients shrink slightly as predictors are added across the models, since some of their association with wages is absorbed by the added controls.

Summary Plot

library(sjPlot)
sjp.lm(mod3, sort.est = FALSE, title = "Regression effects from IPUMS data - Wage as outcome", show.summary = TRUE)

The plot sums up the table. The quadratic age term (age^2) stands out with its large negative coefficient, while the other predictors are all positively associated with the outcome variable.