library(haven)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(ggplot2)
library(broom)
ipums<-read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")
names(ipums) #print the column names
## [1] "year" "datanum" "serial" "hhwt" "statefip"
## [6] "met2013" "puma" "gq" "ownershp" "ownershpd"
## [11] "bedrooms" "pernum" "perwt" "famsize" "nchild"
## [16] "nchlt5" "eldch" "nsibs" "relate" "related"
## [21] "sex" "age" "marst" "birthyr" "fertyr"
## [26] "race" "raced" "hispan" "hispand" "bpl"
## [31] "bpld" "citizen" "yrsusa1" "language" "languaged"
## [36] "speakeng" "educ" "educd" "empstat" "empstatd"
## [41] "labforce" "occ" "ind" "inctot" "incwage"
## [46] "poverty" "hwsei" "migrate1" "migrate1d" "carpool"
## [51] "trantime"
mypums <- ipums%>%
filter(labforce==2, age>-18, incwage>0)%>%
mutate(mywage= ifelse(incwage%in%c(999998, 999999), NA, incwage))%>%
mutate(mytrantime= ifelse(trantime==000, NA, trantime))%>%
mutate(mysexcode= ifelse(sex==1, "male", "female"))%>%
mutate(myeducd = case_when(.$educd %in% c(0:61)~"nohs",
.$educd %in% c(62:64)~"hs",
.$educd %in% c(65:100)~"somecoll",
.$educd %in% c(101:116)~"collgrad",
.$educd ==999 ~ "missing"))%>%
mutate(race_eth = case_when(.$hispan %in% c(1:4) & .$race %in%c(1:9) ~ "hispanic",
.$hispan ==0 & .$race==1 ~"0nh_white",
.$hispan ==0 & .$race==2 ~"nh_black",
.$hispan ==0 & .$race%in%c(3,7,8,9) ~"nh_other",
.$hispan ==0 & .$race%in%c(4:6) ~"nh_asian",
.$hispan==9 ~ "missing"))%>%
mutate(mybpl= case_when(.$bpl %in%c (120:999)~"foreign born", .$bpl %in%c (1:119)~"US born"))%>%
mutate(mypoverty= case_when(.$poverty %in%c (1:100)~"poverty", .$poverty %in%c (101:501)~"not poverty", .$poverty %in%c (0:1)~"NA"))
mod1 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode, data=mypums)
mod2 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode + race_eth, data=mypums)
mod3 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode + race_eth + mypoverty, data=mypums)
anova(mod1, mod2, mod3)
## Analysis of Variance Table
##
## Model 1: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd +
## mysexcode
## Model 2: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd +
## mysexcode + race_eth
## Model 3: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd +
## mysexcode + race_eth + mypoverty
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 122092 125076
## 2 122088 124459 4 617.1 173.64 < 2.2e-16 ***
## 3 122086 108470 2 15988.5 8997.73 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(mod3, which=1)
plot(mod3, which=2)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(mod3)
##
## studentized Breusch-Pagan test
##
## data: mod3
## BP = 1521.1, df = 12, p-value < 2.2e-16
ks.test(resid(mod3), y=pnorm)
## Warning in ks.test(resid(mod3), y = pnorm): ties should not be present for
## the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: resid(mod3)
## D = 0.098746, p-value < 2.2e-16
## alternative hypothesis: two-sided
mod1 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode, data=mypums)
mod2 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode + race_eth, data=mypums)
mod3 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode + race_eth + mypoverty, data=mypums)
anova(mod1, mod2, mod3)
## Analysis of Variance Table
##
## Model 1: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd +
## mysexcode
## Model 2: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd +
## mysexcode + race_eth
## Model 3: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd +
## mysexcode + race_eth + mypoverty
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 122092 125076
## 2 122088 124459 4 617.1 173.64 < 2.2e-16 ***
## 3 122086 108470 2 15988.5 8997.73 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AICs<-AIC(mod1, mod2, mod3)
AICs$diff <- AICs$AIC-AICs$AIC[1]
AICs
## df AIC diff
## mod1 8 349459.3 0.0000
## mod2 12 348863.4 -595.9006
## mod3 14 332079.0 -17380.3198
For my model, I used Incwage as my continuous outcome variable and trantime as my continuous predictor. Other predictors I used in my model were education, sex, race/ethnicity, poverty and birthplace. I created three nested OLS regression models and conducted an ANOVA on all three models. Model 3 was the best model because of the F test (8997.73) and the small P-value. I evaluated the assumptions of the model by plotting (residuals vs. fitted and normal Q-Q) and conducting the Breusch-Pagan test and the Kolmogorov-Smirnov tests. I finally compared the models to one another using the F test and the AIC (AIC value on model 3 was lower than the others) and concluded that Model 3 was the best fitting of the 3 that were used.
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2015). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2. http://CRAN.R-project.org/package=stargazer
stargazer(mod1, mod2, mod3, type = "html", style= "demography", ci= T)
| log(mywage) | |||
| Model 1 | Model 2 | Model 3 | |
| scale(mytrantime) | 0.324*** | 0.334*** | 0.265*** |
| (0.310, 0.338) | (0.320, 0.348) | (0.252, 0.278) | |
| scale(I(mytrantime2)) | -0.218*** | -0.225*** | -0.172*** |
| (-0.232, -0.204) | (-0.238, -0.211) | (-0.185, -0.159) | |
| myeducdhs | -0.800*** | -0.785*** | -0.705*** |
| (-0.815, -0.785) | (-0.801, -0.770) | (-0.719, -0.691) | |
| myeducdnohs | -1.362*** | -1.323*** | -1.177*** |
| (-1.384, -1.341) | (-1.345, -1.300) | (-1.198, -1.156) | |
| myeducdsomecoll | -0.671*** | -0.659*** | -0.565*** |
| (-0.685, -0.657) | (-0.673, -0.645) | (-0.578, -0.552) | |
| mysexcodemale | 0.413*** | 0.407*** | 0.378*** |
| (0.402, 0.425) | (0.395, 0.418) | (0.367, 0.388) | |
| race_ethhispanic | -0.107*** | -0.051*** | |
| (-0.124, -0.090) | (-0.067, -0.034) | ||
| race_ethnh_asian | -0.086*** | -0.043*** | |
| (-0.111, -0.061) | (-0.067, -0.020) | ||
| race_ethnh_black | -0.210*** | -0.129*** | |
| (-0.230, -0.190) | (-0.147, -0.110) | ||
| race_ethnh_other | -0.207*** | -0.136*** | |
| (-0.244, -0.170) | (-0.170, -0.102) | ||
| mypovertynot poverty | 1.709*** | ||
| (1.660, 1.758) | |||
| mypovertypoverty | 0.347*** | ||
| (0.294, 0.401) | |||
| Constant | 10.634*** | 10.670*** | 8.997*** |
| (10.623, 10.645) | (10.659, 10.682) | (8.947, 9.048) | |
| N | 122,099 | 122,099 | 122,099 |
| R2 | 0.197 | 0.201 | 0.303 |
| Adjusted R2 | 0.197 | 0.201 | 0.303 |
| Residual Std. Error | 1.012 (df = 122092) | 1.010 (df = 122088) | 0.943 (df = 122086) |
| F Statistic | 4,983.395*** (df = 6; 122092) | 3,065.298*** (df = 10; 122088) | 4,430.510*** (df = 12; 122086) |
| p < .05; p < .01; p < .001 | |||