library(readr)
library(car)
library(broom)
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(haven)
library(dplyr)
library(tidyr)
ipums<-read_dta("https://github.com/coreysparks/data/blob/master/usa_00045.dta?raw=true")
newpums<-ipums%>%
filter(labforce==2, age>=18, incwage>0, educd!=999, race!=9, empstat>0)%>% #filtering out the data
mutate(mywage= ifelse(incwage%in%c(999998,999999), NA,incwage), #outcome variable
sexrecode=ifelse(sex==1, "male", "female"))%>%
mutate(sex_recode=ifelse(sexrecode=="male", 1, 0))%>% #predictor variable 2
#age is predictor variable 1
mutate(edurec = case_when(.$educd %in% c(0:61)~"nohs",
.$educd %in% c(62:64)~"hs",
.$educd %in% c(65:100)~"somecoll",
.$educd %in% c(101:116)~"collgrad",
.$educd ==999 ~ "missing"))%>%
mutate(edu_rec=ifelse(edurec=="collgrad", 1,0))%>% #predictor variable 3
mutate(race_eth = case_when(.$hispan %in% c(1:4) & .$race %in%c(1:9) ~ "hispanic",
.$hispan ==0 & .$race==1 ~"0nh_white",
.$hispan ==0 & .$race==2 ~"nh_black",
.$hispan ==0 & .$race%in%c(3,7,8,9) ~"nh_other",
.$hispan ==0 & .$race%in%c(4:6) ~"nh_asian",
.$hispan==9 ~ "missing"))%>%
mutate(race.eth=ifelse(race_eth=="0nh_white", 1, 0))%>% #predictor variable 4
mutate(emp.stat=case_when (.$empstat== "1" ~ "Employed",
.$empstat== "2" ~ "Unemployed",
.$empstat== "3" ~ "Not in labor force"))%>%
mutate(emp_stat=ifelse(emp.stat=="Employed", 1, 0)) #predictor variable 5
mod1<-lm(log(mywage)~scale(age)+scale(I(age^2))+scale(sex_recode)+scale(edu_rec), data=newpums)
mod2<-lm(log(mywage)~scale(age)+scale(I(age^2))+scale(sex_recode)+scale(edu_rec)+scale(race.eth), data=newpums)
mod3<-lm(log(mywage)~scale(age)+scale(I(age^2))+scale(sex_recode)+scale(edu_rec)+scale(race.eth)+scale(emp_stat), data=newpums)
summary(mod1)
##
## Call:
## lm(formula = log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) +
## scale(edu_rec), data = newpums)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.9589 -0.4174 0.1469 0.5912 4.5380
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.299786 0.002648 3890.03 <2e-16 ***
## scale(age) 2.304498 0.015552 148.18 <2e-16 ***
## scale(I(age^2)) -2.056210 0.015538 -132.34 <2e-16 ***
## scale(sex_recode) 0.206483 0.002649 77.94 <2e-16 ***
## scale(edu_rec) 0.337050 0.002670 126.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9624 on 132126 degrees of freedom
## Multiple R-squared: 0.2962, Adjusted R-squared: 0.2962
## F-statistic: 1.39e+04 on 4 and 132126 DF, p-value: < 2.2e-16
summary(mod2)
##
## Call:
## lm(formula = log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) +
## scale(edu_rec) + scale(race.eth), data = newpums)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0001 -0.4105 0.1433 0.5903 4.5346
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.299786 0.002640 3901.17 <2e-16 ***
## scale(age) 2.310552 0.015509 148.98 <2e-16 ***
## scale(I(age^2)) -2.071752 0.015504 -133.63 <2e-16 ***
## scale(sex_recode) 0.204928 0.002642 77.55 <2e-16 ***
## scale(edu_rec) 0.329827 0.002676 123.28 <2e-16 ***
## scale(race.eth) 0.073750 0.002677 27.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9597 on 132125 degrees of freedom
## Multiple R-squared: 0.3003, Adjusted R-squared: 0.3002
## F-statistic: 1.134e+04 on 5 and 132125 DF, p-value: < 2.2e-16
summary(mod3)
##
## Call:
## lm(formula = log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) +
## scale(edu_rec) + scale(race.eth) + scale(emp_stat), data = newpums)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0149 -0.4143 0.1293 0.5749 4.4712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.299786 0.002586 3982.77 <2e-16 ***
## scale(age) 2.249140 0.015214 147.84 <2e-16 ***
## scale(I(age^2)) -2.023094 0.015200 -133.10 <2e-16 ***
## scale(sex_recode) 0.207318 0.002588 80.09 <2e-16 ***
## scale(edu_rec) 0.321156 0.002623 122.43 <2e-16 ***
## scale(race.eth) 0.067901 0.002624 25.88 <2e-16 ***
## scale(emp_stat) 0.194389 0.002601 74.74 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.94 on 132124 degrees of freedom
## Multiple R-squared: 0.3286, Adjusted R-squared: 0.3286
## F-statistic: 1.078e+04 on 6 and 132124 DF, p-value: < 2.2e-16
anova(mod1, mod2, mod3)
## Analysis of Variance Table
##
## Model 1: log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) +
## scale(edu_rec)
## Model 2: log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) +
## scale(edu_rec) + scale(race.eth)
## Model 3: log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) +
## scale(edu_rec) + scale(race.eth) + scale(emp_stat)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 132126 122389
## 2 132125 121691 1 698.9 790.88 < 2.2e-16 ***
## 3 132124 116754 1 4936.7 5586.54 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
AICs<-AIC(mod1, mod2, mod3)
AICs$diff<-AICs$AIC-AICs$AIC[1]
AICs
## df AIC diff
## mod1 6 364864.2 0.0000
## mod2 7 364109.5 -754.6644
## mod3 8 358639.6 -6224.6121
plot(mod3, which=1)
plot(mod3, which=2)
ks.test(rstudent(mod3), y=pnorm)
## Warning in ks.test(rstudent(mod3), y = pnorm): ties should not be present
## for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: rstudent(mod3)
## D = 0.084737, p-value < 2.2e-16
## alternative hypothesis: two-sided
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(mod3)
##
## studentized Breusch-Pagan test
##
## data: mod3
## BP = 2351.1, df = 6, p-value < 2.2e-16
coeftest(mod3, vcov. = hccm(mod3, type = "hc0")) #hc0 is the White's statndard errors
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.2997855 0.0025860 3982.879 < 2.2e-16 ***
## scale(age) 2.2491402 0.0189922 118.424 < 2.2e-16 ***
## scale(I(age^2)) -2.0230944 0.0194459 -104.037 < 2.2e-16 ***
## scale(sex_recode) 0.2073177 0.0025936 79.933 < 2.2e-16 ***
## scale(edu_rec) 0.3211558 0.0026434 121.495 < 2.2e-16 ***
## scale(race.eth) 0.0679009 0.0026136 25.980 < 2.2e-16 ***
## scale(emp_stat) 0.1943891 0.0036586 53.132 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(mod3)
##
## Call:
## lm(formula = log(mywage) ~ scale(age) + scale(I(age^2)) + scale(sex_recode) +
## scale(edu_rec) + scale(race.eth) + scale(emp_stat), data = newpums)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.0149 -0.4143 0.1293 0.5749 4.4712
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.299786 0.002586 3982.77 <2e-16 ***
## scale(age) 2.249140 0.015214 147.84 <2e-16 ***
## scale(I(age^2)) -2.023094 0.015200 -133.10 <2e-16 ***
## scale(sex_recode) 0.207318 0.002588 80.09 <2e-16 ***
## scale(edu_rec) 0.321156 0.002623 122.43 <2e-16 ***
## scale(race.eth) 0.067901 0.002624 25.88 <2e-16 ***
## scale(emp_stat) 0.194389 0.002601 74.74 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.94 on 132124 degrees of freedom
## Multiple R-squared: 0.3286, Adjusted R-squared: 0.3286
## F-statistic: 1.078e+04 on 6 and 132124 DF, p-value: < 2.2e-16
library(stargazer)
##
## Please cite as:
## Hlavac, Marek (2015). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2. http://CRAN.R-project.org/package=stargazer
stargazer(mod1, mod2, mod3, type = "html", style = "demography", ci = T)
| log(mywage) | |||
| Model 1 | Model 2 | Model 3 | |
| scale(age) | 2.304*** | 2.311*** | 2.249*** |
| (2.274, 2.335) | (2.280, 2.341) | (2.219, 2.279) | |
| scale(I(age2)) | -2.056*** | -2.072*** | -2.023*** |
| (-2.087, -2.026) | (-2.102, -2.041) | (-2.053, -1.993) | |
| scale(sex_recode) | 0.206*** | 0.205*** | 0.207*** |
| (0.201, 0.212) | (0.200, 0.210) | (0.202, 0.212) | |
| scale(edu_rec) | 0.337*** | 0.330*** | 0.321*** |
| (0.332, 0.342) | (0.325, 0.335) | (0.316, 0.326) | |
| scale(race.eth) | 0.074*** | 0.068*** | |
| (0.069, 0.079) | (0.063, 0.073) | ||
| scale(emp_stat) | 0.194*** | ||
| (0.189, 0.199) | |||
| Constant | 10.300*** | 10.300*** | 10.300*** |
| (10.295, 10.305) | (10.295, 10.305) | (10.295, 10.305) | |
| N | 132,131 | 132,131 | 132,131 |
| R2 | 0.296 | 0.300 | 0.329 |
| Adjusted R2 | 0.296 | 0.300 | 0.329 |
| Residual Std. Error | 0.962 (df = 132126) | 0.960 (df = 132125) | 0.940 (df = 132124) |
| F Statistic | 13,904.010*** (df = 4; 132126) | 11,338.760*** (df = 5; 132125) | 10,779.510*** (df = 6; 132124) |
| p < .05; p < .01; p < .001 | |||
library(sjPlot)
sjp.lm(mod3, sort.est = F, Title = "Regression effects from IPUMS data - Wage as outcome", show.summary = T)