library(haven)
## Warning: package 'haven' was built under R version 3.4.2
library(car)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.4.2
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
ipums<-read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")
## `curl` package not installed, falling back to using `url()`
names(ipums) #print the column names
##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "ownershp"  "ownershpd"
## [11] "bedrooms"  "pernum"    "perwt"     "famsize"   "nchild"   
## [16] "nchlt5"    "eldch"     "nsibs"     "relate"    "related"  
## [21] "sex"       "age"       "marst"     "birthyr"   "fertyr"   
## [26] "race"      "raced"     "hispan"    "hispand"   "bpl"      
## [31] "bpld"      "citizen"   "yrsusa1"   "language"  "languaged"
## [36] "speakeng"  "educ"      "educd"     "empstat"   "empstatd" 
## [41] "labforce"  "occ"       "ind"       "inctot"    "incwage"  
## [46] "poverty"   "hwsei"     "migrate1"  "migrate1d" "carpool"  
## [51] "trantime"

For this assignment, I selected wage as my continuous outcome variable. The continuous predictor is commute time and five additional predictors: 1. Sex 2. Race 3. Age 4. Marital status 5. Educational attainment

I selected only the population over the age of 18 who are in the labor force for this analysis.

newpums4<-ipums%>%
      filter(labforce==2, age>=18, incwage>0)%>%
      mutate(mytime= ifelse(trantime%in%c(000), NA, trantime),
      mywage= ifelse(incwage%in%c(999998,999999), NA,incwage),
      mysex=ifelse(sex==1, "male", "female"))%>%
mutate(myedu = case_when(.$educd %in% c(0:61)~"nohs", 
                            .$educd %in% c(62:64)~"hs",
                            .$educd %in% c(65:100)~"somecoll",
                            .$educd %in% c(101:116)~"collgrad", 
                            .$educd ==999 ~ "missing"))%>%
mutate(mymarital = case_when(.$marst%in% c(1)~"married spouse present", 
                            .$marst %in% c(2)~"married spouse absent",
                            .$marst %in% c(3)~"separated",
                            .$marst %in% c(4)~"divorced", 
                            .$marst %in% c(5)~"widowed", 
                            .$marst %in% c(6)~"never married"))%>%
mutate(myrace = case_when(.$race %in% c(1)~"white", 
                            .$race %in% c(2)~"black",
                            .$race %in% c(3)~"american indian", 
                            .$race %in% c(4:9) ~ "other"))

Analysis of Variance Table with Four Models

Each model increases in its complexity as we integrate additional variables. The first model includes commute time, age and sex. The second model, commute time, age, sex and race. The third model, includes age commute time, sex, race and marital status. Finally, the fourth model includes commute time, age, sex, race, marital status and education.

model1<-lm(log(mywage)~scale(age)+scale(I(age^2))+mytime+mysex, data=newpums4)
model2<-lm(log(mywage)~scale(age)+scale(I(age^2))+mytime+mysex+myrace, data=newpums4)
model3<-lm(log(mywage)~scale(age)+scale(I(age^2))+mytime+mysex+myrace+mymarital, data=newpums4)
model4<-lm(log(mywage)~scale(age)+scale(I(age^2))+mytime+mysex+myrace+mymarital+myedu, data=newpums4)

Anova Test

The Anova test provide evidence that all models are able to reject the null hypothesis. However, more clarification is needed to determine which model is the most reliable.

anova(model1,model2, model3, model4)
## Analysis of Variance Table
## 
## Model 1: log(mywage) ~ scale(age) + scale(I(age^2)) + mytime + mysex
## Model 2: log(mywage) ~ scale(age) + scale(I(age^2)) + mytime + mysex + 
##     myrace
## Model 3: log(mywage) ~ scale(age) + scale(I(age^2)) + mytime + mysex + 
##     myrace + mymarital
## Model 4: log(mywage) ~ scale(age) + scale(I(age^2)) + mytime + mysex + 
##     myrace + mymarital + myedu
##   Res.Df    RSS Df Sum of Sq       F    Pr(>F)    
## 1 120755 111822                                   
## 2 120752 110971  3     851.1  356.21 < 2.2e-16 ***
## 3 120747 109416  5    1554.9  390.43 < 2.2e-16 ***
## 4 120744  96170  3   13246.0 5543.55 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

AIC Model

This test shows that the model that provides the best fit is model #4 with an AIC of 17. However, Model 3 also provides a reliable AIC value of 14. In this case, it would be best to consider the best and less complicated model so selecting Model #3 wound be a better choice.

AICs<-AIC(model1, model2, model3, model4)
AICs$diff<-AICs$AIC-AICs$AIC[1]
AICs
##        df      AIC       diff
## model1  6 333428.5      0.000
## model2  9 332511.8   -916.692
## model3 14 330817.8  -2610.683
## model4 17 315241.0 -18187.473

Nested Models

library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2015). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2. http://CRAN.R-project.org/package=stargazer
stargazer(model1, model2, model3, model4, type = "html", style = "demography", ci = T)
log(mywage)
Model 1 Model 2 Model 3 Model 4
scale(age) 2.413*** 2.414*** 2.159*** 1.989***
(2.381, 2.445) (2.381, 2.446) (2.124, 2.194) (1.955, 2.022)
scale(I(age2)) -2.172*** -2.178*** -1.974*** -1.804***
(-2.205, -2.140) (-2.210, -2.146) (-2.008, -1.940) (-1.837, -1.772)
mytime 0.004*** 0.004*** 0.004*** 0.004***
(0.004, 0.004) (0.004, 0.005) (0.004, 0.005) (0.004, 0.004)
mysexmale 0.372*** 0.364*** 0.352*** 0.402***
(0.362, 0.383) (0.354, 0.375) (0.342, 0.363) (0.392, 0.413)
myraceblack 0.024 0.053 -0.028
(-0.038, 0.086) (-0.009, 0.114) (-0.086, 0.029)
myraceother 0.190*** 0.169*** 0.050
(0.128, 0.251) (0.108, 0.230) (-0.007, 0.107)
myracewhite 0.290*** 0.258*** 0.113***
(0.230, 0.350) (0.199, 0.317) (0.057, 0.169)
mymaritalmarried spouse absent -0.067** -0.061**
(-0.110, -0.024) (-0.101, -0.020)
mymaritalmarried spouse present 0.171*** 0.091***
(0.153, 0.189) (0.074, 0.108)
mymaritalnever married -0.117*** -0.150***
(-0.138, -0.095) (-0.170, -0.130)
mymaritalseparated -0.142*** -0.062**
(-0.184, -0.099) (-0.102, -0.022)
mymaritalwidowed -0.032 -0.011
(-0.075, 0.011) (-0.051, 0.029)
myeduhs -0.711***
(-0.724, -0.697)
myedunohs -0.997***
(-1.018, -0.977)
myedusomecoll -0.549***
(-0.562, -0.537)
Constant 10.038*** 9.787*** 9.765*** 10.370***
(10.028, 10.048) (9.727, 9.847) (9.703, 9.826) (10.312, 10.428)
N 120,760 120,760 120,760 120,760
R2 0.219 0.225 0.236 0.329
Adjusted R2 0.219 0.225 0.236 0.329
Residual Std. Error 0.962 (df = 120755) 0.959 (df = 120752) 0.952 (df = 120747) 0.892 (df = 120744)
F Statistic 8,484.913*** (df = 4; 120755) 5,017.898*** (df = 7; 120752) 3,111.570*** (df = 12; 120747) 3,940.754*** (df = 15; 120744)
p < .05; p < .01; p < .001