DEM 7273 Homework 8

library(haven)
library(readr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(car)

## 
## Attaching package: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library(ggplot2)
library(broom)

ipums<-read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")
names(ipums) #print the column names

##  [1] "year"      "datanum"   "serial"    "hhwt"      "statefip" 
##  [6] "met2013"   "puma"      "gq"        "ownershp"  "ownershpd"
## [11] "bedrooms"  "pernum"    "perwt"     "famsize"   "nchild"   
## [16] "nchlt5"    "eldch"     "nsibs"     "relate"    "related"  
## [21] "sex"       "age"       "marst"     "birthyr"   "fertyr"   
## [26] "race"      "raced"     "hispan"    "hispand"   "bpl"      
## [31] "bpld"      "citizen"   "yrsusa1"   "language"  "languaged"
## [36] "speakeng"  "educ"      "educd"     "empstat"   "empstatd" 
## [41] "labforce"  "occ"       "ind"       "inctot"    "incwage"  
## [46] "poverty"   "hwsei"     "migrate1"  "migrate1d" "carpool"  
## [51] "trantime"

Create dataset with one continuous outcome variable, one continuous predictor, and 5 other predictors

mypums <- ipums%>%
  filter(labforce==2, age>-18, incwage>0)%>%
  mutate(mywage= ifelse(incwage%in%c(999998, 999999), NA, incwage))%>%
  mutate(mytrantime=  ifelse(trantime==000, NA, trantime))%>%
  mutate(mysexcode= ifelse(sex==1, "male", "female"))%>%
  mutate(myeducd = case_when(.$educd %in% c(0:61)~"nohs", 
                            .$educd %in% c(62:64)~"hs",
                            .$educd %in% c(65:100)~"somecoll",
                            .$educd %in% c(101:116)~"collgrad", 
                            .$educd ==999 ~ "missing"))%>%
  mutate(race_eth = case_when(.$hispan %in% c(1:4) & .$race %in%c(1:9) ~ "hispanic", 
                          .$hispan ==0 & .$race==1 ~"0nh_white",
                         .$hispan ==0 & .$race==2 ~"nh_black",
                         .$hispan ==0 & .$race%in%c(3,7,8,9) ~"nh_other",
                         .$hispan ==0 & .$race%in%c(4:6) ~"nh_asian",
                          .$hispan==9 ~ "missing"))%>%
  mutate(mybpl= case_when(.$bpl %in%c (120:999)~"foreign born", .$bpl %in%c (1:119)~"US born"))%>%
  mutate(mypoverty= case_when(.$poverty %in%c (1:100)~"poverty", .$poverty %in%c (101:501)~"not poverty", .$poverty %in%c (0:1)~"NA"))

Estimate at least two nested OLS regression models for my outcome

mod1 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode,  data=mypums)
mod2 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode + race_eth, data=mypums)
mod3 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode +  race_eth + mypoverty, data=mypums)

anova(mod1, mod2, mod3)

## Analysis of Variance Table
## 
## Model 1: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd + 
##     mysexcode
## Model 2: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd + 
##     mysexcode + race_eth
## Model 3: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd + 
##     mysexcode + race_eth + mypoverty
##   Res.Df    RSS Df Sum of Sq       F    Pr(>F)    
## 1 122092 125076                                   
## 2 122088 124459  4     617.1  173.64 < 2.2e-16 ***
## 3 122086 108470  2   15988.5 8997.73 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Evaluate the assumptions of the model

plot(mod3, which=1)

plot(mod3, which=2)

library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

bptest(mod3)

## 
##  studentized Breusch-Pagan test
## 
## data:  mod3
## BP = 1521.1, df = 12, p-value < 2.2e-16

ks.test(resid(mod3), y=pnorm)

## Warning in ks.test(resid(mod3), y = pnorm): ties should not be present for
## the Kolmogorov-Smirnov test

## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  resid(mod3)
## D = 0.098746, p-value < 2.2e-16
## alternative hypothesis: two-sided

Compare models to one another using both the F test and the AIC

mod1 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode,  data=mypums)
mod2 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode + race_eth, data=mypums)
mod3 <- lm(log(mywage)~scale(mytrantime)+ scale(I(mytrantime^2)) + myeducd + mysexcode +  race_eth + mypoverty, data=mypums)

anova(mod1, mod2, mod3)

## Analysis of Variance Table
## 
## Model 1: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd + 
##     mysexcode
## Model 2: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd + 
##     mysexcode + race_eth
## Model 3: log(mywage) ~ scale(mytrantime) + scale(I(mytrantime^2)) + myeducd + 
##     mysexcode + race_eth + mypoverty
##   Res.Df    RSS Df Sum of Sq       F    Pr(>F)    
## 1 122092 125076                                   
## 2 122088 124459  4     617.1  173.64 < 2.2e-16 ***
## 3 122086 108470  2   15988.5 8997.73 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

AICs<-AIC(mod1, mod2, mod3)
AICs$diff <- AICs$AIC-AICs$AIC[1]
AICs

##      df      AIC        diff
## mod1  8 349459.3      0.0000
## mod2 12 348863.4   -595.9006
## mod3 14 332079.0 -17380.3198

Discuss the results of your final model (best fitting)

For my model, I used Incwage as my continuous outcome variable and trantime as my continuous predictor. Other predictors I used in my model were education, sex, race/ethnicity, poverty and birthplace. I created three nested OLS regression models and conducted an ANOVA on all three models. Model 3 was the best model because of the F test (8997.73) and the small P-value. I evaluated the assumptions of the model by plotting (residuals vs. fitted and normal Q-Q) and conducting the Breusch-Pagan test and the Kolmogorov-Smirnov tests. I finally compared the models to one another using the F test and the AIC (AIC value on model 3 was lower than the others) and concluded that Model 3 was the best fitting of the 3 that were used.

library(stargazer)

## 
## Please cite as:

##  Hlavac, Marek (2015). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2. http://CRAN.R-project.org/package=stargazer

stargazer(mod1, mod2, mod3, type = "html", style= "demography", ci= T)


	log(mywage)
	Model 1	Model 2	Model 3

scale(mytrantime)	0.324^***	0.334^***	0.265^***
	(0.310, 0.338)	(0.320, 0.348)	(0.252, 0.278)
scale(I(mytrantime2))	-0.218^***	-0.225^***	-0.172^***
	(-0.232, -0.204)	(-0.238, -0.211)	(-0.185, -0.159)
myeducdhs	-0.800^***	-0.785^***	-0.705^***
	(-0.815, -0.785)	(-0.801, -0.770)	(-0.719, -0.691)
myeducdnohs	-1.362^***	-1.323^***	-1.177^***
	(-1.384, -1.341)	(-1.345, -1.300)	(-1.198, -1.156)
myeducdsomecoll	-0.671^***	-0.659^***	-0.565^***
	(-0.685, -0.657)	(-0.673, -0.645)	(-0.578, -0.552)
mysexcodemale	0.413^***	0.407^***	0.378^***
	(0.402, 0.425)	(0.395, 0.418)	(0.367, 0.388)
race_ethhispanic		-0.107^***	-0.051^***
		(-0.124, -0.090)	(-0.067, -0.034)
race_ethnh_asian		-0.086^***	-0.043^***
		(-0.111, -0.061)	(-0.067, -0.020)
race_ethnh_black		-0.210^***	-0.129^***
		(-0.230, -0.190)	(-0.147, -0.110)
race_ethnh_other		-0.207^***	-0.136^***
		(-0.244, -0.170)	(-0.170, -0.102)
mypovertynot poverty			1.709^***
			(1.660, 1.758)
mypovertypoverty			0.347^***
			(0.294, 0.401)
Constant	10.634^***	10.670^***	8.997^***
	(10.623, 10.645)	(10.659, 10.682)	(8.947, 9.048)
N	122,099	122,099	122,099
R²	0.197	0.201	0.303
Adjusted R²	0.197	0.201	0.303
Residual Std. Error	1.012 (df = 122092)	1.010 (df = 122088)	0.943 (df = 122086)
F Statistic	4,983.395^*** (df = 6; 122092)	3,065.298^*** (df = 10; 122088)	4,430.510^*** (df = 12; 122086)

p < .05; p < .01; p < .001