library(haven)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(ggplot2)
library(broom)

ipums<-read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")
names(ipums) #print the column names
##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "ownershp"  "ownershpd"
## [11] "bedrooms"  "pernum"    "perwt"     "famsize"   "nchild"   
## [16] "nchlt5"    "eldch"     "nsibs"     "relate"    "related"  
## [21] "sex"       "age"       "marst"     "birthyr"   "fertyr"   
## [26] "race"      "raced"     "hispan"    "hispand"   "bpl"      
## [31] "bpld"      "citizen"   "yrsusa1"   "language"  "languaged"
## [36] "speakeng"  "educ"      "educd"     "empstat"   "empstatd" 
## [41] "labforce"  "occ"       "ind"       "inctot"    "incwage"  
## [46] "poverty"   "hwsei"     "migrate1"  "migrate1d" "carpool"  
## [51] "trantime"

Create dataset with one continuous outcome variable, one continuous predictor, and 5 other predictors

mypums <- ipums%>%
  filter(labforce==2, age>-18, incwage>0)%>%
  mutate(mywage= ifelse(incwage%in%c(999998, 999999), NA, incwage))%>%
  mutate(mytrantime=  ifelse(trantime==000, NA, trantime))%>%
  mutate(mysexcode= ifelse(sex==1, "male", "female"))%>%
  mutate(myeducd = case_when(.$educd %in% c(0:61)~"nohs", 
                            .$educd %in% c(62:64)~"hs",
                            .$educd %in% c(65:100)~"somecoll",
                            .$educd %in% c(101:116)~"collgrad", 
                            .$educd ==999 ~ "missing"))%>%
  mutate(race_eth = case_when(.$hispan %in% c(1:4) & .$race %in%c(1:9) ~ "hispanic", 
                          .$hispan ==0 & .$race==1 ~"0nh_white",
                         .$hispan ==0 & .$race==2 ~"nh_black",
                         .$hispan ==0 & .$race%in%c(3,7,8,9) ~"nh_other",
                         .$hispan ==0 & .$race%in%c(4:6) ~"nh_asian",
                          .$hispan==9 ~ "missing"))%>%
  mutate(mybpl= case_when(.$bpl %in%c (120:999)~"foreign born", .$bpl %in%c (1:119)~"US born"))%>%
  mutate(mypoverty= case_when(.$poverty %in%c (1:100)~"poverty", .$poverty %in%c (101:501)~"not poverty", .$poverty %in%c (0:1)~"NA"))

Estimate at least two nested OLS regression models for my outcome

mod1 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode,  data=mypums)
mod2 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode + race_eth, data=mypums)
mod3 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode +  race_eth + mypoverty, data=mypums)

anova(mod1, mod2, mod3)
## Analysis of Variance Table
## 
## Model 1: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd + 
##     mysexcode
## Model 2: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd + 
##     mysexcode + race_eth
## Model 3: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd + 
##     mysexcode + race_eth + mypoverty
##   Res.Df    RSS Df Sum of Sq       F    Pr(>F)    
## 1 122092 125076                                   
## 2 122088 124459  4     617.1  173.64 < 2.2e-16 ***
## 3 122086 108470  2   15988.5 8997.73 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Evaluate the assumptions of the model

plot(mod3, which=1)

plot(mod3, which=2)

library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
bptest(mod3)
## 
##  studentized Breusch-Pagan test
## 
## data:  mod3
## BP = 1521.1, df = 12, p-value < 2.2e-16
ks.test(resid(mod3), y=pnorm)
## Warning in ks.test(resid(mod3), y = pnorm): ties should not be present for
## the Kolmogorov-Smirnov test
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  resid(mod3)
## D = 0.098746, p-value < 2.2e-16
## alternative hypothesis: two-sided

Compare models to one another using both the F test and the AIC

mod1 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode,  data=mypums)
mod2 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode + race_eth, data=mypums)
mod3 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode +  race_eth + mypoverty, data=mypums)

anova(mod1, mod2, mod3)
## Analysis of Variance Table
## 
## Model 1: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd + 
##     mysexcode
## Model 2: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd + 
##     mysexcode + race_eth
## Model 3: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd + 
##     mysexcode + race_eth + mypoverty
##   Res.Df    RSS Df Sum of Sq       F    Pr(>F)    
## 1 122092 125076                                   
## 2 122088 124459  4     617.1  173.64 < 2.2e-16 ***
## 3 122086 108470  2   15988.5 8997.73 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AICs<-AIC(mod1, mod2, mod3)
AICs$diff <- AICs$AIC-AICs$AIC[1]
AICs
##      df      AIC        diff
## mod1  8 349459.3      0.0000
## mod2 12 348863.4   -595.9006
## mod3 14 332079.0 -17380.3198

Discuss the results of your final model (best fitting)

For my model, I used Incwage as my continuous outcome variable and trantime as my continuous predictor. Other predictors I used in my model were education, sex, race/ethnicity, poverty and birthplace. I created three nested OLS regression models and conducted an ANOVA on all three models. Model 3 was the best model because of the F test (8997.73) and the small P-value. I evaluated the assumptions of the model by plotting (residuals vs. fitted and normal Q-Q) and conducting the Breusch-Pagan test and the Kolmogorov-Smirnov tests. I finally compared the models to one another using the F test and the AIC (AIC value on model 3 was lower than the others) and concluded that Model 3 was the best fitting of the 3 that were used.

library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2015). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2. http://CRAN.R-project.org/package=stargazer
stargazer(mod1, mod2, mod3, type = "html", style= "demography", ci= T)
log(mywage)
Model 1 Model 2 Model 3
scale(mytrantime) 0.324*** 0.334*** 0.265***
(0.310, 0.338) (0.320, 0.348) (0.252, 0.278)
scale(I(mytrantime2)) -0.218*** -0.225*** -0.172***
(-0.232, -0.204) (-0.238, -0.211) (-0.185, -0.159)
myeducdhs -0.800*** -0.785*** -0.705***
(-0.815, -0.785) (-0.801, -0.770) (-0.719, -0.691)
myeducdnohs -1.362*** -1.323*** -1.177***
(-1.384, -1.341) (-1.345, -1.300) (-1.198, -1.156)
myeducdsomecoll -0.671*** -0.659*** -0.565***
(-0.685, -0.657) (-0.673, -0.645) (-0.578, -0.552)
mysexcodemale 0.413*** 0.407*** 0.378***
(0.402, 0.425) (0.395, 0.418) (0.367, 0.388)
race_ethhispanic -0.107*** -0.051***
(-0.124, -0.090) (-0.067, -0.034)
race_ethnh_asian -0.086*** -0.043***
(-0.111, -0.061) (-0.067, -0.020)
race_ethnh_black -0.210*** -0.129***
(-0.230, -0.190) (-0.147, -0.110)
race_ethnh_other -0.207*** -0.136***
(-0.244, -0.170) (-0.170, -0.102)
mypovertynot poverty 1.709***
(1.660, 1.758)
mypovertypoverty 0.347***
(0.294, 0.401)
Constant 10.634*** 10.670*** 8.997***
(10.623, 10.645) (10.659, 10.682) (8.947, 9.048)
N 122,099 122,099 122,099
R2 0.197 0.201 0.303
Adjusted R2 0.197 0.201 0.303
Residual Std. Error 1.012 (df = 122092) 1.010 (df = 122088) 0.943 (df = 122086)
F Statistic 4,983.395*** (df = 6; 122092) 3,065.298*** (df = 10; 122088) 4,430.510*** (df = 12; 122086)
p < .05; p < .01; p < .001