library(car)
library(mice)
library(ggplot2)
library(dplyr)
library(readr)
ds1 <- read_csv("C:/Users/drayr/OneDrive/Desktop/DEM Spring 2021/Stats II 7183/Project/data/cps_00012.csv")
##
## -- Column specification --------------------------------------------------------
## cols(
## .default = col_double(),
## ASECFLAG = col_logical(),
## COVIDUNAW = col_logical(),
## COVIDLOOK = col_logical()
## )
## i Use `spec()` for the full column specifications.
## Warning: 33939 parsing failures.
## row col expected actual file
## 33909 ASECFLAG 1/0/T/F/TRUE/FALSE 2 'C:/Users/drayr/OneDrive/Desktop/DEM Spring 2021/Stats II 7183/Project/data/cps_00012.csv'
## 33910 ASECFLAG 1/0/T/F/TRUE/FALSE 2 'C:/Users/drayr/OneDrive/Desktop/DEM Spring 2021/Stats II 7183/Project/data/cps_00012.csv'
## 33911 ASECFLAG 1/0/T/F/TRUE/FALSE 2 'C:/Users/drayr/OneDrive/Desktop/DEM Spring 2021/Stats II 7183/Project/data/cps_00012.csv'
## 33912 ASECFLAG 1/0/T/F/TRUE/FALSE 2 'C:/Users/drayr/OneDrive/Desktop/DEM Spring 2021/Stats II 7183/Project/data/cps_00012.csv'
## 33913 ASECFLAG 1/0/T/F/TRUE/FALSE 2 'C:/Users/drayr/OneDrive/Desktop/DEM Spring 2021/Stats II 7183/Project/data/cps_00012.csv'
## ..... ........ .................. ...... ..........................................................................................
## See problems(...) for more details.
#The names in the data are very ugly, so I make them less ugly
nams<-names(ds1)
head(nams, n=10)
## [1] "YEAR" "SERIAL" "MONTH" "HWTFINL" "CPSID" "ASECFLAG"
## [7] "STATEFIP" "COUNTY" "FAMINC" "PERNUM"
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-tolower(gsub(pattern = "_",replacement = "",x = nams))
names(ds1)<-newnames
summary(ds1)
## year serial month hwtfinl
## Min. :2019 Min. :56720 Min. : 1.000 Min. : 2336
## 1st Qu.:2019 1st Qu.:58433 1st Qu.: 3.000 1st Qu.: 3864
## Median :2020 Median :59368 Median : 8.000 Median : 4310
## Mean :2020 Mean :59379 Mean : 6.962 Mean : 4557
## 3rd Qu.:2020 3rd Qu.:60320 3rd Qu.:10.000 3rd Qu.: 4932
## Max. :2021 Max. :62078 Max. :12.000 Max. :21835
## cpsid asecflag statefip county
## Min. :2.018e+13 Mode:logical Min. :48 Min. : 0
## 1st Qu.:2.019e+13 NA's:77641 1st Qu.:48 1st Qu.: 0
## Median :2.019e+13 Median :48 Median : 0
## Mean :2.019e+13 Mean :48 Mean : 7452
## 3rd Qu.:2.020e+13 3rd Qu.:48 3rd Qu.: 0
## Max. :2.021e+13 Max. :48 Max. :48485
## faminc pernum wtfinl cpsidp
## Min. :100.0 Min. : 1.000 Min. : 0 Min. :2.018e+13
## 1st Qu.:730.0 1st Qu.: 1.000 1st Qu.: 3968 1st Qu.:2.019e+13
## Median :830.0 Median : 2.000 Median : 4476 Median :2.019e+13
## Mean :754.5 Mean : 1.946 Mean : 4745 Mean :2.019e+13
## 3rd Qu.:842.0 3rd Qu.: 2.000 3rd Qu.: 5191 3rd Qu.:2.020e+13
## Max. :843.0 Max. :14.000 Max. :21835 Max. :2.021e+13
## age sex race hispan empstat
## Min. :16 Min. :1.000 Min. :100.0 Min. : 0.0 Min. : 1.0
## 1st Qu.:28 1st Qu.:1.000 1st Qu.:100.0 1st Qu.: 0.0 1st Qu.:10.0
## Median :40 Median :2.000 Median :100.0 Median : 0.0 Median :10.0
## Mean :40 Mean :1.517 Mean :157.7 Mean : 66.7 Mean :17.2
## 3rd Qu.:52 3rd Qu.:2.000 3rd Qu.:100.0 3rd Qu.:100.0 3rd Qu.:32.0
## Max. :64 Max. :2.000 Max. :814.0 Max. :612.0 Max. :36.0
## ind classwkr durunem2 whyunemp
## Min. : 0 Min. : 0.00 Min. : 0.0000 Min. :0.0000
## 1st Qu.: 0 1st Qu.: 0.00 1st Qu.: 0.0000 1st Qu.:0.0000
## Median :5180 Median :22.00 Median : 0.0000 Median :0.0000
## Mean :4444 Mean :15.91 Mean : 0.3218 Mean :0.1185
## 3rd Qu.:7860 3rd Qu.:22.00 3rd Qu.: 0.0000 3rd Qu.:0.0000
## Max. :9890 Max. :29.00 Max. :16.0000 Max. :6.0000
## whyptlwk wnlook wkstat empsame
## Min. : 0.00 Min. : 1.0 Min. :11.00 Min. : 1.00
## 1st Qu.: 0.00 1st Qu.:999.0 1st Qu.:11.00 1st Qu.: 2.00
## Median : 0.00 Median :999.0 Median :11.00 Median :99.00
## Mean : 11.78 Mean :978.2 Mean :40.32 Mean :58.89
## 3rd Qu.: 0.00 3rd Qu.:999.0 3rd Qu.:99.00 3rd Qu.:99.00
## Max. :130.00 Max. :999.0 Max. :99.00 Max. :99.00
## wrkoffer earnwt hourwage paidhour
## Min. : 1.00 Min. : 0 Min. : 1.01 Min. :0.0000
## 1st Qu.:99.00 1st Qu.: 0 1st Qu.: 999.99 1st Qu.:0.0000
## Median :99.00 Median : 0 Median : 999.99 Median :0.0000
## Mean :98.27 Mean : 4726 Mean : 917.63 Mean :0.2397
## 3rd Qu.:99.00 3rd Qu.:12776 3rd Qu.: 999.99 3rd Qu.:0.0000
## Max. :99.00 Max. :65538 Max. : 999.99 Max. :2.0000
## union earnweek uhrsworkorg eligorg
## Min. :0.0000 Min. : 0.01 Min. : 1.0 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 9999.99 1st Qu.:999.0 1st Qu.:0.0000
## Median :0.0000 Median : 9999.99 Median :999.0 Median :0.0000
## Mean :0.1667 Mean : 8604.45 Mean :944.5 Mean :0.1558
## 3rd Qu.:0.0000 3rd Qu.: 9999.99 3rd Qu.:999.0 3rd Qu.:0.0000
## Max. :3.0000 Max. : 9999.99 Max. :999.0 Max. :1.0000
## otpay covidunaw covidlook
## Min. : 1.00 Mode:logical Mode:logical
## 1st Qu.:99.00 TRUE:33325 TRUE:9662
## Median :99.00 NA's:44316 NA's:67979
## Mean :83.76
## 3rd Qu.:99.00
## Max. :99.00
library(questionr)
library(dplyr)
ds2<-ds1%>%
mutate(
covid = as.factor(if_else(year==2020 & month>=4 | year==2021 & month<3,1,0)),
covidint = as.factor(if_else(year==2019 & month<=9,"1precov_Q32019",
if_else(year==2019 & month>=10,"2precov_Q42019",
if_else(year==2020 & month<=2,"3precov_Q12020",
if_else(year==2020 & month>=3 & month<=8,"4earlycov_Q22020",
if_else(year==2020 & month>=9 & month<=12,"5midcov_Q32020",
if_else(year==2021 & month>=1 & month<=3,"6latecov_Q42020","NA"))))))
),
hhinc =Recode(faminc,recodes="100:490='1_lt15k';500:600='2_15-25k';710:720='3_25-35k';
730:740='4_35-50k';810:830='5_50k-75k';840:843='6_75kplus';999='NA'", as.factor = T),
agegrp=cut(age, breaks=c(16,24,34,44,54,64)),
male=Recode(sex,recodes="2='0';1='1';9=NA", as.factor = T),
raceeth=as.factor(if_else(race==100 & hispan==000,"NH_wht",
if_else(race==200 & hispan==000, "NH_blk",
if_else(!race%in%c('100','200','999') & hispan==000, "NH_other",
if_else(!hispan%in%c('000','901','902'), "Hisp","NA"))))
),
employst=Recode( empstat,recodes="1='Mil';10='emp';12='emp_nwklwk';20:22='unemp';32='NILF_utw';34='NILF_oth';36='NILF_ret'; else=NA" , as.factor = T),
unemployed=Recode( empstat,recodes="1:12='0';20:22='1';else=NA"),
industry=Recode( ind,recodes="770='const';1070:3990='manuf';4070:4590='whsale';4670:5790='retail';6070:6390='trans';570:690='utl';
6470:6672='media_ent';6870:6992='bnk_ins';7071:7072='realest';7860:7890='Teach';7970:8290='Health';8670:8690='rstrnt_svc';8770:8891='repair_maint';8970:8990='perscare';9160:9390='gen_supprt';9470='crimjust'; else=NA", as.factor = T ),
selfemp=Recode( classwkr,recodes="10:14='1';20:29='0';else=NA" ),
uedurwks =Recode( durunem2,recodes="0:3='lt4wks';4:7='4-10wks';8:10='11-22wks';11:15='23-52wks';16='gt52wks'; else=NA", as.factor = T),
whyue =Recode( whyunemp,recodes="1='templayoff';2='jobloss';3='tempwk_end';4='leftjob';5='re_ent';6='new_ent'; else=NA" , as.factor = T),
whypt =Recode( whyptlwk,recodes="60='onlyfindpt';80='ftdown';100:101='health_med';111='persday';120:122='chcare_fam';
123='sch';130='oth'; else=NA" , as.factor = T),
underemp=Recode( wkstat,recodes="12='1'; 13='1'; 21='1';99=NA; else='0'" ),#reconsider recodes
sameemp=Recode( empsame,recodes="2='1';1='0';else=NA" ),
offerwrk=Recode( wrkoffer,recodes= "2='0';1='1';else=NA"),
paidhourly=Recode( paidhour,recodes="2='1';1='0';else=NA"),
unioncover=Recode( union,recodes="2:3='1';1='0';else=NA"),
earnerstud=as.factor(x=eligorg),
ottippay=Recode( otpay,recodes="2='1';1='0';else=NA"),
earnweekna=recode.na(earnweek,9999.99,as.numeric = T),
earnyear=earnweek*52
)
## Recoded 65562 values to NA.
ds2a<-ds2%>%
filter(earnerstud==1 )%>%
select(earnweekna, hhinc, covidint, agegrp, male, raceeth, industry)
ds2b<-ds2%>%
filter(earnerstud==0 )%>%
select(earnweekna, hhinc, covidint, agegrp, male, raceeth, industry)
set.seed(410)
samp<-sample(1:dim(ds2b)[1], size = 1000) #smaller sample from non-earner study group
ds2b<-ds2b[samp,]
ds3<-rbind(ds2a,ds2b)
summary(ds3)
## earnweekna hhinc covidint agegrp
## Min. : 1 1_lt15k : 679 1precov_Q32019 :2114 (16,24]:1654
## 1st Qu.: 579 2_15-25k : 681 2precov_Q42019 :2211 (24,34]:3001
## Median : 972 3_25-35k :1120 3precov_Q12020 :1483 (34,44]:3059
## Mean :1003 4_35-50k :1614 4earlycov_Q22020:3370 (44,54]:2892
## 3rd Qu.:1416 5_50k-75k:2472 5midcov_Q32020 :2674 (54,64]:2412
## Max. :1843 6_75kplus:6529 6latecov_Q42020 :1243 NA's : 77
## NA's :1016
## male raceeth industry
## 0:6302 Hisp :4740 Health :1515
## 1:6793 NH_blk :1432 retail :1447
## NH_other: 971 Teach :1351
## NH_wht :5952 manuf :1101
## const : 964
## (Other):3700
## NA's :3017
ds3$wkearn.mean<-ifelse(is.na(ds3$earnweekna)==T, mean(ds3$earnweekna, na.rm=T), ds3$earnweekna)
mean(ds3$earnweekna, na.rm=T)
## [1] 1003.404
mean(ds3$wkearn.mean)
## [1] 1003.404
median(ds3$earnweekna, na.rm=T)
## [1] 972
median(ds3$wkearn.mean)
## [1] 1003.404
var(ds3$earnweekna, na.rm=T)
## [1] 251959.4
var(ds3$wkearn.mean)
## [1] 232409.2
library(reshape2)
ds3%>%
select(wkearn.mean, earnweekna)%>%
melt()%>%
ggplot()+geom_freqpoly(aes(x = value,
y = ..density.., colour = variable))
## No id variables; using all as measure variables
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1016 rows containing non-finite values (stat_bin).
##Testing for MAR
fitmar<-lm(earnweekna~hhinc+covidint+agegrp+raceeth+male+is.na(industry), ds3)
summary(fitmar)
##
## Call:
## lm(formula = earnweekna ~ hhinc + covidint + agegrp + raceeth +
## male + is.na(industry), data = ds3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1386.39 -285.21 15.58 305.95 1652.85
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 108.4544 21.0854 5.144 2.74e-07 ***
## hhinc2_15-25k 10.3051 23.5193 0.438 0.66128
## hhinc3_25-35k 56.4508 21.0323 2.684 0.00728 **
## hhinc4_35-50k 130.1379 19.7893 6.576 5.03e-11 ***
## hhinc5_50k-75k 239.3397 18.8599 12.690 < 2e-16 ***
## hhinc6_75kplus 465.2678 17.8290 26.096 < 2e-16 ***
## covidint2precov_Q42019 -0.3496 12.8617 -0.027 0.97832
## covidint3precov_Q12020 25.3863 14.3363 1.771 0.07662 .
## covidint4earlycov_Q22020 14.8477 11.7747 1.261 0.20734
## covidint5midcov_Q32020 18.2010 12.3558 1.473 0.14076
## covidint6latecov_Q42020 3.4219 15.1537 0.226 0.82135
## agegrp(24,34] 404.4469 13.0457 31.002 < 2e-16 ***
## agegrp(34,44] 488.6643 13.0471 37.454 < 2e-16 ***
## agegrp(44,54] 492.0499 13.1771 37.341 < 2e-16 ***
## agegrp(54,64] 470.6266 13.7552 34.214 < 2e-16 ***
## raceethNH_blk 50.6578 12.8216 3.951 7.83e-05 ***
## raceethNH_other 132.0393 15.1258 8.729 < 2e-16 ***
## raceethNH_wht 136.5762 8.6503 15.789 < 2e-16 ***
## male1 171.1952 7.4268 23.051 < 2e-16 ***
## is.na(industry)TRUE 63.4951 9.1209 6.961 3.54e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 405.8 on 12015 degrees of freedom
## (1060 observations deleted due to missingness)
## Multiple R-squared: 0.3435, Adjusted R-squared: 0.3425
## F-statistic: 330.9 on 19 and 12015 DF, p-value: < 2.2e-16
#Those with missing industry have higher weekly incomes, suggesting that weekly income may not be missing at random with respect to industry.
##Pattern of Missingness
md.pattern(ds3[,c("earnweekna", "hhinc", "agegrp","covidint","raceeth","male","industry")])
## hhinc covidint raceeth male agegrp earnweekna industry
## 9501 1 1 1 1 1 1 1 0
## 2534 1 1 1 1 1 1 0 1
## 538 1 1 1 1 1 0 1 1
## 445 1 1 1 1 1 0 0 2
## 37 1 1 1 1 0 1 1 1
## 7 1 1 1 1 0 1 0 2
## 2 1 1 1 1 0 0 1 2
## 31 1 1 1 1 0 0 0 3
## 0 0 0 0 77 1016 3017 4110
##Multiple Imputation
dat2<-ds3
samp2<-sample(1:dim(dat2)[1], replace = F, size = 500)
dat2$earnwkknock<-dat2$earnweekna
dat2$earnwkknock[samp2]<-NA
head(dat2[, c("earnwkknock","earnweekna")])
imp<-mice(data = dat2[,c("earnweekna", "hhinc", "agegrp","covidint","raceeth","male","industry")], m = 5)
##
## iter imp variable
## 1 1 earnweekna agegrp industry
## 1 2 earnweekna agegrp industry
## 1 3 earnweekna agegrp industry
## 1 4 earnweekna agegrp industry
## 1 5 earnweekna agegrp industry
## 2 1 earnweekna agegrp industry
## 2 2 earnweekna agegrp industry
## 2 3 earnweekna agegrp industry
## 2 4 earnweekna agegrp industry
## 2 5 earnweekna agegrp industry
## 3 1 earnweekna agegrp industry
## 3 2 earnweekna agegrp industry
## 3 3 earnweekna agegrp industry
## 3 4 earnweekna agegrp industry
## 3 5 earnweekna agegrp industry
## 4 1 earnweekna agegrp industry
## 4 2 earnweekna agegrp industry
## 4 3 earnweekna agegrp industry
## 4 4 earnweekna agegrp industry
## 4 5 earnweekna agegrp industry
## 5 1 earnweekna agegrp industry
## 5 2 earnweekna agegrp industry
## 5 3 earnweekna agegrp industry
## 5 4 earnweekna agegrp industry
## 5 5 earnweekna agegrp industry
print(imp)
## Class: mids
## Number of multiple imputations: 5
## Imputation methods:
## earnweekna hhinc agegrp covidint raceeth male industry
## "pmm" "" "polyreg" "" "" "" "polyreg"
## PredictorMatrix:
## earnweekna hhinc agegrp covidint raceeth male industry
## earnweekna 0 1 1 1 1 1 1
## hhinc 1 0 1 1 1 1 1
## agegrp 1 1 0 1 1 1 1
## covidint 1 1 1 0 1 1 1
## raceeth 1 1 1 1 0 1 1
## male 1 1 1 1 1 0 1
head(imp$imp$earnweekna)
summary(imp$imp$earnweekna)
## 1 2 3 4
## Min. : 3.0 Min. : 13.0 Min. : 6.0 Min. : 17.0
## 1st Qu.: 538.8 1st Qu.: 515.0 1st Qu.: 526.0 1st Qu.: 550.2
## Median : 908.0 Median : 849.0 Median : 931.0 Median : 927.5
## Mean : 944.5 Mean : 933.4 Mean : 948.6 Mean : 943.7
## 3rd Qu.:1334.0 3rd Qu.:1357.8 3rd Qu.:1346.8 3rd Qu.:1334.5
## Max. :1843.0 Max. :1843.0 Max. :1843.0 Max. :1843.0
## 5
## Min. : 6.0
## 1st Qu.: 548.0
## Median : 931.0
## Mean : 958.2
## 3rd Qu.:1346.0
## Max. :1843.0
summary(ds3$earnweekna)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1 579 972 1003 1416 1843 1016
head(imp$imp$industry)
summary(imp$imp$industry)
## 1 2 3 4 5
## Health :457 Health :479 Health :440 Health :462 Health :496
## Teach :430 Teach :423 retail :440 Teach :396 retail :392
## retail :418 retail :380 Teach :395 retail :394 Teach :387
## manuf :342 manuf :327 manuf :353 manuf :336 manuf :367
## const :277 const :280 const :263 const :302 const :284
## bnk_ins:226 trans :224 bnk_ins:236 bnk_ins:238 bnk_ins:252
## (Other):867 (Other):904 (Other):890 (Other):889 (Other):839
library(lattice)
stripplot(imp,earnweekna~raceeth|.imp, pch=20)
dat.imp<-complete(imp, action = 1)
head(dat.imp[is.na(ds3$earnweekna)==T,], n=10)
head(ds3[is.na(ds3$earnweekna)==T,c("earnweekna", "hhinc", "agegrp","covidint","raceeth","male","industry")], n=10)
fit.wkinc<-with(data=imp ,expr=lm(earnweekna~hhinc+agegrp+covidint+raceeth+male+industry))
fit.wkinc
## call :
## with.mids(data = imp, expr = lm(earnweekna ~ hhinc + agegrp +
## covidint + raceeth + male + industry))
##
## call1 :
## mice(data = dat2[, c("earnweekna", "hhinc", "agegrp", "covidint",
## "raceeth", "male", "industry")], m = 5)
##
## nmis :
## earnweekna hhinc agegrp covidint raceeth male industry
## 1016 0 77 0 0 0 3017
##
## analyses :
## [[1]]
##
## Call:
## lm(formula = earnweekna ~ hhinc + agegrp + covidint + raceeth +
## male + industry)
##
## Coefficients:
## (Intercept) hhinc2_15-25k hhinc3_25-35k
## 413.990 28.086 59.157
## hhinc4_35-50k hhinc5_50k-75k hhinc6_75kplus
## 119.043 222.515 432.958
## agegrp(24,34] agegrp(34,44] agegrp(44,54]
## 327.995 411.652 417.254
## agegrp(54,64] covidint2precov_Q42019 covidint3precov_Q12020
## 391.685 -10.769 20.211
## covidint4earlycov_Q22020 covidint5midcov_Q32020 covidint6latecov_Q42020
## 4.397 8.301 -4.313
## raceethNH_blk raceethNH_other raceethNH_wht
## 41.084 143.043 134.036
## male1 industryconst industrycrimjust
## 165.055 -155.432 -99.961
## industrygen_supprt industryHealth industrymanuf
## -240.305 -157.513 -118.035
## industrymedia_ent industryperscare industryrealest
## -214.152 -387.994 -131.634
## industryrepair_maint industryretail industryrstrnt_svc
## -245.155 -321.806 -427.733
## industryTeach industrytrans industryutl
## -180.548 -185.299 -18.774
## industrywhsale
## -190.564
##
##
## [[2]]
##
## Call:
## lm(formula = earnweekna ~ hhinc + agegrp + covidint + raceeth +
## male + industry)
##
## Coefficients:
## (Intercept) hhinc2_15-25k hhinc3_25-35k
## 404.728 11.603 52.868
## hhinc4_35-50k hhinc5_50k-75k hhinc6_75kplus
## 114.845 214.995 433.476
## agegrp(24,34] agegrp(34,44] agegrp(44,54]
## 331.604 414.167 422.755
## agegrp(54,64] covidint2precov_Q42019 covidint3precov_Q12020
## 394.315 -2.010 22.036
## covidint4earlycov_Q22020 covidint5midcov_Q32020 covidint6latecov_Q42020
## 8.273 12.648 -5.388
## raceethNH_blk raceethNH_other raceethNH_wht
## 41.005 146.208 132.259
## male1 industryconst industrycrimjust
## 163.726 -139.429 -107.904
## industrygen_supprt industryHealth industrymanuf
## -239.695 -152.400 -120.043
## industrymedia_ent industryperscare industryrealest
## -228.755 -434.742 -135.243
## industryrepair_maint industryretail industryrstrnt_svc
## -228.377 -309.503 -408.997
## industryTeach industrytrans industryutl
## -175.187 -184.954 -39.757
## industrywhsale
## -162.555
##
##
## [[3]]
##
## Call:
## lm(formula = earnweekna ~ hhinc + agegrp + covidint + raceeth +
## male + industry)
##
## Coefficients:
## (Intercept) hhinc2_15-25k hhinc3_25-35k
## 404.874 23.042 58.569
## hhinc4_35-50k hhinc5_50k-75k hhinc6_75kplus
## 112.800 222.052 428.290
## agegrp(24,34] agegrp(34,44] agegrp(44,54]
## 337.091 417.495 421.676
## agegrp(54,64] covidint2precov_Q42019 covidint3precov_Q12020
## 389.940 -3.361 20.125
## covidint4earlycov_Q22020 covidint5midcov_Q32020 covidint6latecov_Q42020
## 9.554 9.907 2.436
## raceethNH_blk raceethNH_other raceethNH_wht
## 45.266 142.534 132.884
## male1 industryconst industrycrimjust
## 161.896 -143.321 -87.369
## industrygen_supprt industryHealth industrymanuf
## -241.269 -152.248 -110.352
## industrymedia_ent industryperscare industryrealest
## -224.570 -407.861 -133.023
## industryrepair_maint industryretail industryrstrnt_svc
## -227.647 -314.851 -424.885
## industryTeach industrytrans industryutl
## -165.146 -179.743 -3.626
## industrywhsale
## -185.576
##
##
## [[4]]
##
## Call:
## lm(formula = earnweekna ~ hhinc + agegrp + covidint + raceeth +
## male + industry)
##
## Coefficients:
## (Intercept) hhinc2_15-25k hhinc3_25-35k
## 401.203 17.193 63.127
## hhinc4_35-50k hhinc5_50k-75k hhinc6_75kplus
## 123.205 220.975 430.359
## agegrp(24,34] agegrp(34,44] agegrp(44,54]
## 328.918 412.331 413.321
## agegrp(54,64] covidint2precov_Q42019 covidint3precov_Q12020
## 389.173 -8.918 18.760
## covidint4earlycov_Q22020 covidint5midcov_Q32020 covidint6latecov_Q42020
## 7.652 10.534 -6.355
## raceethNH_blk raceethNH_other raceethNH_wht
## 49.598 136.272 136.207
## male1 industryconst industrycrimjust
## 166.707 -143.036 -89.088
## industrygen_supprt industryHealth industrymanuf
## -225.693 -146.695 -109.175
## industrymedia_ent industryperscare industryrealest
## -202.995 -378.792 -131.005
## industryrepair_maint industryretail industryrstrnt_svc
## -225.252 -307.456 -422.689
## industryTeach industrytrans industryutl
## -165.937 -175.720 -41.066
## industrywhsale
## -156.684
##
##
## [[5]]
##
## Call:
## lm(formula = earnweekna ~ hhinc + agegrp + covidint + raceeth +
## male + industry)
##
## Coefficients:
## (Intercept) hhinc2_15-25k hhinc3_25-35k
## 412.326 18.449 46.625
## hhinc4_35-50k hhinc5_50k-75k hhinc6_75kplus
## 114.346 217.858 429.159
## agegrp(24,34] agegrp(34,44] agegrp(44,54]
## 337.919 415.668 417.913
## agegrp(54,64] covidint2precov_Q42019 covidint3precov_Q12020
## 393.794 -7.232 22.522
## covidint4earlycov_Q22020 covidint5midcov_Q32020 covidint6latecov_Q42020
## 11.952 16.589 2.484
## raceethNH_blk raceethNH_other raceethNH_wht
## 42.805 132.003 131.266
## male1 industryconst industrycrimjust
## 165.836 -163.389 -101.188
## industrygen_supprt industryHealth industrymanuf
## -239.904 -150.018 -124.488
## industrymedia_ent industryperscare industryrealest
## -223.122 -395.397 -160.898
## industryrepair_maint industryretail industryrstrnt_svc
## -261.807 -320.941 -426.935
## industryTeach industrytrans industryutl
## -181.117 -180.402 -42.377
## industrywhsale
## -172.075
with (data=imp, exp=(sd(earnweekna)))
## call :
## with.mids(data = imp, expr = (sd(earnweekna)))
##
## call1 :
## mice(data = dat2[, c("earnweekna", "hhinc", "agegrp", "covidint",
## "raceeth", "male", "industry")], m = 5)
##
## nmis :
## earnweekna hhinc agegrp covidint raceeth male industry
## 1016 0 77 0 0 0 3017
##
## analyses :
## [[1]]
## [1] 501.9464
##
## [[2]]
## [1] 503.3447
##
## [[3]]
## [1] 501.7715
##
## [[4]]
## [1] 501.0831
##
## [[5]]
## [1] 501.6806
with (data=imp, exp=(prop.table(table(hhinc))))
## call :
## with.mids(data = imp, expr = (prop.table(table(hhinc))))
##
## call1 :
## mice(data = dat2[, c("earnweekna", "hhinc", "agegrp", "covidint",
## "raceeth", "male", "industry")], m = 5)
##
## nmis :
## earnweekna hhinc agegrp covidint raceeth male industry
## 1016 0 77 0 0 0 3017
##
## analyses :
## [[1]]
## hhinc
## 1_lt15k 2_15-25k 3_25-35k 4_35-50k 5_50k-75k 6_75kplus
## 0.05185185 0.05200458 0.08552883 0.12325315 0.18877434 0.49858725
##
## [[2]]
## hhinc
## 1_lt15k 2_15-25k 3_25-35k 4_35-50k 5_50k-75k 6_75kplus
## 0.05185185 0.05200458 0.08552883 0.12325315 0.18877434 0.49858725
##
## [[3]]
## hhinc
## 1_lt15k 2_15-25k 3_25-35k 4_35-50k 5_50k-75k 6_75kplus
## 0.05185185 0.05200458 0.08552883 0.12325315 0.18877434 0.49858725
##
## [[4]]
## hhinc
## 1_lt15k 2_15-25k 3_25-35k 4_35-50k 5_50k-75k 6_75kplus
## 0.05185185 0.05200458 0.08552883 0.12325315 0.18877434 0.49858725
##
## [[5]]
## hhinc
## 1_lt15k 2_15-25k 3_25-35k 4_35-50k 5_50k-75k 6_75kplus
## 0.05185185 0.05200458 0.08552883 0.12325315 0.18877434 0.49858725
with (data=imp, exp=(prop.table(table(raceeth))))
## call :
## with.mids(data = imp, expr = (prop.table(table(raceeth))))
##
## call1 :
## mice(data = dat2[, c("earnweekna", "hhinc", "agegrp", "covidint",
## "raceeth", "male", "industry")], m = 5)
##
## nmis :
## earnweekna hhinc agegrp covidint raceeth male industry
## 1016 0 77 0 0 0 3017
##
## analyses :
## [[1]]
## raceeth
## Hisp NH_blk NH_other NH_wht
## 0.36197022 0.10935472 0.07415044 0.45452463
##
## [[2]]
## raceeth
## Hisp NH_blk NH_other NH_wht
## 0.36197022 0.10935472 0.07415044 0.45452463
##
## [[3]]
## raceeth
## Hisp NH_blk NH_other NH_wht
## 0.36197022 0.10935472 0.07415044 0.45452463
##
## [[4]]
## raceeth
## Hisp NH_blk NH_other NH_wht
## 0.36197022 0.10935472 0.07415044 0.45452463
##
## [[5]]
## raceeth
## Hisp NH_blk NH_other NH_wht
## 0.36197022 0.10935472 0.07415044 0.45452463
with (data=imp, exp=(prop.table(table(agegrp))))
## call :
## with.mids(data = imp, expr = (prop.table(table(agegrp))))
##
## call1 :
## mice(data = dat2[, c("earnweekna", "hhinc", "agegrp", "covidint",
## "raceeth", "male", "industry")], m = 5)
##
## nmis :
## earnweekna hhinc agegrp covidint raceeth male industry
## 1016 0 77 0 0 0 3017
##
## analyses :
## [[1]]
## agegrp
## (16,24] (24,34] (34,44] (44,54] (54,64]
## 0.1285223 0.2306224 0.2347461 0.2216113 0.1844979
##
## [[2]]
## agegrp
## (16,24] (24,34] (34,44] (44,54] (54,64]
## 0.1287514 0.2303169 0.2342879 0.2219168 0.1847270
##
## [[3]]
## agegrp
## (16,24] (24,34] (34,44] (44,54] (54,64]
## 0.1280641 0.2301642 0.2348988 0.2219168 0.1849561
##
## [[4]]
## agegrp
## (16,24] (24,34] (34,44] (44,54] (54,64]
## 0.1285987 0.2304696 0.2346697 0.2216113 0.1846506
##
## [[5]]
## agegrp
## (16,24] (24,34] (34,44] (44,54] (54,64]
## 0.1288278 0.2301642 0.2345170 0.2217640 0.1847270
est.p<-pool(fit.wkinc)
print(est.p)
## Class: mipo m = 5
## term m estimate ubar b t
## 1 (Intercept) 5 407.423970 573.46379 29.904755 609.34950
## 2 hhinc2_15-25k 5 19.674521 459.52126 38.723072 505.98894
## 3 hhinc3_25-35k 5 56.069177 369.85935 41.262112 419.37389
## 4 hhinc4_35-50k 5 116.847773 327.28738 17.972563 348.85446
## 5 hhinc5_50k-75k 5 219.678972 297.13457 10.153007 309.31818
## 6 hhinc6_75kplus 5 430.848453 265.59740 5.248646 271.89578
## 7 agegrp(24,34] 5 332.705284 153.25133 21.040591 178.50004
## 8 agegrp(34,44] 5 414.262530 155.12874 5.744996 162.02273
## 9 agegrp(44,54] 5 418.583630 158.67001 14.217186 175.73063
## 10 agegrp(54,64] 5 391.781382 171.81283 5.168205 178.01467
## 11 covidint2precov_Q42019 5 -6.457798 144.30905 13.652856 160.69248
## 12 covidint3precov_Q12020 5 20.730708 179.50009 2.358605 182.33041
## 13 covidint4earlycov_Q22020 5 8.365759 120.80773 7.636276 129.97126
## 14 covidint5midcov_Q32020 5 11.595577 132.59360 10.219570 144.85709
## 15 covidint6latecov_Q42020 5 -2.227088 200.31288 18.830905 222.90996
## 16 raceethNH_blk 5 43.951646 145.56693 12.955969 161.11410
## 17 raceethNH_other 5 140.012036 202.20739 33.015700 241.82623
## 18 raceethNH_wht 5 133.330418 65.77275 3.595442 70.08728
## 19 male1 5 164.644222 59.12333 3.560269 63.39565
## 20 industryconst 5 -148.921441 322.53046 101.951507 444.87227
## 21 industrycrimjust 5 -97.101899 644.07478 75.128643 734.22915
## 22 industrygen_supprt 5 -237.373030 544.43929 42.998444 596.03743
## 23 industryHealth 5 -151.774786 255.13503 15.608440 273.86516
## 24 industrymanuf 5 -116.418406 288.90247 42.533936 339.94320
## 25 industrymedia_ent 5 -218.718963 1638.05955 105.606895 1764.78782
## 26 industryperscare 5 -400.957168 1601.59619 469.845920 2165.41129
## 27 industryrealest 5 -138.360559 1289.42432 161.376365 1483.07596
## 28 industryrepair_maint 5 -237.647552 871.15167 244.904812 1165.03744
## 29 industryretail 5 -314.911450 264.55352 42.180828 315.17052
## 30 industryrstrnt_svc 5 -422.247956 343.76605 58.696104 414.20138
## 31 industryTeach 5 -173.586932 264.12769 59.371998 335.37409
## 32 industrytrans 5 -181.223716 367.25890 15.920080 386.36300
## 33 industryutl 5 -29.119961 851.47627 297.145157 1208.05046
## 34 industrywhsale 5 -173.490757 535.13068 210.399753 787.61038
## dfcom df riv lambda fmi
## 1 13061 1054.37486 0.06257711 0.05889183 0.06067191
## 2 13061 456.04768 0.10112195 0.09183538 0.09579211
## 3 13061 279.96885 0.13387396 0.11806776 0.12430118
## 4 13061 964.20036 0.06589645 0.06182256 0.06376255
## 5 13061 2138.66806 0.04100367 0.03938859 0.04028566
## 6 13061 4704.95483 0.02371399 0.02316467 0.02357964
## 7 13061 196.41854 0.16475360 0.14144932 0.15005986
## 8 13061 1877.59999 0.04444048 0.04254956 0.04356780
## 9 13061 409.64598 0.10752267 0.09708394 0.10146017
## 10 13061 2612.48044 0.03609653 0.03483896 0.03557700
## 11 13061 372.58062 0.11353014 0.10195516 0.10673733
## 12 13061 7245.09913 0.01576783 0.01552306 0.01579471
## 13 13061 754.66134 0.07585220 0.07050429 0.07295788
## 14 13061 533.20459 0.09248926 0.08465919 0.08807334
## 15 13061 376.74105 0.11280896 0.10137316 0.10610600
## 16 13061 414.47126 0.10680422 0.09649784 0.10082629
## 17 13061 147.01994 0.19593171 0.16383186 0.17497928
## 18 13061 971.82821 0.06559754 0.06155939 0.06348474
## 19 13061 821.34879 0.07226120 0.06739141 0.06965407
## 20 13061 52.59706 0.37931862 0.27500435 0.30108471
## 21 13061 259.30219 0.13997501 0.12278779 0.12947635
## 22 13061 510.89118 0.09477298 0.08656861 0.09012357
## 23 13061 799.00516 0.07341261 0.06839179 0.07071499
## 24 13061 174.64278 0.17667112 0.15014486 0.15971299
## 25 13061 729.05128 0.07736488 0.07180935 0.07434522
## 26 13061 58.64385 0.35203325 0.26037322 0.28436999
## 27 13061 229.85920 0.15018457 0.13057432 0.13804172
## 28 13061 62.45911 0.33735317 0.25225436 0.27510055
## 29 13061 152.91755 0.19132988 0.16060193 0.17136913
## 30 13061 136.58228 0.20489319 0.17005092 0.18194282
## 31 13061 87.87524 0.26974225 0.21243859 0.22977139
## 32 13061 1445.53577 0.05201806 0.04944598 0.05075841
## 33 13061 45.68456 0.41877173 0.29516498 0.32412016
## 34 13061 38.75510 0.47180944 0.32056422 0.35310807
lam<-data.frame(lam=est.p$pooled$lambda, param=row.names(est.p$pooled))
ggplot(data=lam,aes(x=param, y=lam))+geom_col()+theme(axis.text.x = element_text(angle = 45, hjust = 1))
## Model comparisons
fit1<-lm(earnweekna~agegrp+covidint+raceeth+male+industry, ds3)
fit2<-lm(wkearn.mean~agegrp+covidint+raceeth+male+industry, ds3)
fit.imp<-lm(earnweekna~agegrp+covidint+raceeth+male+industry, data=dat.imp)
summary(fit1)
##
## Call:
## lm(formula = earnweekna ~ agegrp + covidint + raceeth + male +
## industry, data = ds3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1322.54 -309.14 -5.06 318.16 1268.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 639.197 23.997 26.636 < 2e-16 ***
## agegrp(24,34] 331.444 15.238 21.751 < 2e-16 ***
## agegrp(34,44] 446.261 15.259 29.246 < 2e-16 ***
## agegrp(44,54] 468.015 15.342 30.505 < 2e-16 ***
## agegrp(54,64] 414.567 16.066 25.804 < 2e-16 ***
## covidint2precov_Q42019 -2.620 14.964 -0.175 0.861025
## covidint3precov_Q12020 28.432 16.803 1.692 0.090660 .
## covidint4earlycov_Q22020 23.350 13.791 1.693 0.090458 .
## covidint5midcov_Q32020 24.430 14.355 1.702 0.088816 .
## covidint6latecov_Q42020 14.985 17.561 0.853 0.393498
## raceethNH_blk 41.344 14.868 2.781 0.005434 **
## raceethNH_other 171.691 18.018 9.529 < 2e-16 ***
## raceethNH_wht 197.963 9.718 20.371 < 2e-16 ***
## male1 172.482 9.566 18.031 < 2e-16 ***
## industryconst -228.258 22.361 -10.208 < 2e-16 ***
## industrycrimjust -93.967 31.419 -2.991 0.002790 **
## industrygen_supprt -262.750 28.946 -9.077 < 2e-16 ***
## industryHealth -173.264 20.083 -8.627 < 2e-16 ***
## industrymanuf -149.718 21.311 -7.026 2.28e-12 ***
## industrymedia_ent -186.589 50.212 -3.716 0.000204 ***
## industryperscare -507.927 49.883 -10.182 < 2e-16 ***
## industryrealest -156.656 45.300 -3.458 0.000546 ***
## industryrepair_maint -312.430 36.445 -8.573 < 2e-16 ***
## industryretail -366.937 20.329 -18.050 < 2e-16 ***
## industryrstrnt_svc -499.603 23.021 -21.702 < 2e-16 ***
## industryTeach -180.498 20.417 -8.841 < 2e-16 ***
## industrytrans -214.733 23.995 -8.949 < 2e-16 ***
## industryutl -26.488 37.297 -0.710 0.477597
## industrywhsale -204.364 29.085 -7.026 2.26e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 417.6 on 9472 degrees of freedom
## (3594 observations deleted due to missingness)
## Multiple R-squared: 0.2807, Adjusted R-squared: 0.2786
## F-statistic: 132 on 28 and 9472 DF, p-value: < 2.2e-16
summary(fit2)
##
## Call:
## lm(formula = wkearn.mean ~ agegrp + covidint + raceeth + male +
## industry, data = ds3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1305.93 -299.08 -12.28 309.76 1246.76
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 653.604 23.020 28.393 < 2e-16 ***
## agegrp(24,34] 319.407 14.597 21.881 < 2e-16 ***
## agegrp(34,44] 426.145 14.637 29.114 < 2e-16 ***
## agegrp(44,54] 447.422 14.732 30.372 < 2e-16 ***
## agegrp(54,64] 394.126 15.392 25.606 < 2e-16 ***
## covidint2precov_Q42019 -1.457 14.315 -0.102 0.91891
## covidint3precov_Q12020 29.683 16.090 1.845 0.06508 .
## covidint4earlycov_Q22020 24.313 13.171 1.846 0.06493 .
## covidint5midcov_Q32020 24.561 13.721 1.790 0.07348 .
## covidint6latecov_Q42020 13.093 16.814 0.779 0.43618
## raceethNH_blk 39.354 14.210 2.770 0.00562 **
## raceethNH_other 158.512 17.200 9.216 < 2e-16 ***
## raceethNH_wht 186.290 9.313 20.004 < 2e-16 ***
## male1 162.611 9.145 17.781 < 2e-16 ***
## industryconst -215.901 21.388 -10.094 < 2e-16 ***
## industrycrimjust -86.974 30.222 -2.878 0.00401 **
## industrygen_supprt -246.286 27.820 -8.853 < 2e-16 ***
## industryHealth -164.652 19.210 -8.571 < 2e-16 ***
## industrymanuf -140.034 20.424 -6.856 7.48e-12 ***
## industrymedia_ent -175.485 48.154 -3.644 0.00027 ***
## industryperscare -477.006 47.339 -10.076 < 2e-16 ***
## industryrealest -150.530 43.098 -3.493 0.00048 ***
## industryrepair_maint -290.961 34.691 -8.387 < 2e-16 ***
## industryretail -343.114 19.431 -17.658 < 2e-16 ***
## industryrstrnt_svc -466.436 22.011 -21.191 < 2e-16 ***
## industryTeach -170.019 19.532 -8.705 < 2e-16 ***
## industrytrans -201.859 22.911 -8.811 < 2e-16 ***
## industryutl -28.763 35.433 -0.812 0.41695
## industrywhsale -193.508 27.781 -6.966 3.48e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 410.7 on 10010 degrees of freedom
## (3056 observations deleted due to missingness)
## Multiple R-squared: 0.2648, Adjusted R-squared: 0.2627
## F-statistic: 128.7 on 28 and 10010 DF, p-value: < 2.2e-16
summary(fit.imp)
##
## Call:
## lm(formula = earnweekna ~ agegrp + covidint + raceeth + male +
## industry, data = dat.imp)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1358.88 -311.82 -0.03 324.29 1263.19
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 646.066 20.761 31.120 < 2e-16 ***
## agegrp(24,34] 331.268 13.251 25.000 < 2e-16 ***
## agegrp(34,44] 443.537 13.328 33.278 < 2e-16 ***
## agegrp(44,54] 458.331 13.469 34.028 < 2e-16 ***
## agegrp(54,64] 408.690 14.033 29.123 < 2e-16 ***
## covidint2precov_Q42019 -6.589 12.875 -0.512 0.60883
## covidint3precov_Q12020 37.129 14.351 2.587 0.00969 **
## covidint4earlycov_Q22020 23.079 11.772 1.960 0.04996 *
## covidint5midcov_Q32020 27.617 12.332 2.239 0.02515 *
## covidint6latecov_Q42020 11.667 15.163 0.769 0.44166
## raceethNH_blk 52.571 12.934 4.065 4.84e-05 ***
## raceethNH_other 227.047 15.105 15.032 < 2e-16 ***
## raceethNH_wht 222.939 8.426 26.459 < 2e-16 ***
## male1 188.584 8.248 22.864 < 2e-16 ***
## industryconst -248.590 19.182 -12.959 < 2e-16 ***
## industrycrimjust -113.520 27.258 -4.165 3.14e-05 ***
## industrygen_supprt -279.916 25.154 -11.128 < 2e-16 ***
## industryHealth -187.418 17.177 -10.911 < 2e-16 ***
## industrymanuf -158.328 18.239 -8.681 < 2e-16 ***
## industrymedia_ent -219.692 42.972 -5.112 3.23e-07 ***
## industryperscare -504.037 44.030 -11.447 < 2e-16 ***
## industryrealest -154.706 38.198 -4.050 5.15e-05 ***
## industryrepair_maint -329.208 31.235 -10.540 < 2e-16 ***
## industryretail -390.314 17.377 -22.461 < 2e-16 ***
## industryrstrnt_svc -524.325 19.771 -26.520 < 2e-16 ***
## industryTeach -196.441 17.425 -11.274 < 2e-16 ***
## industrytrans -234.988 20.435 -11.499 < 2e-16 ***
## industryutl -16.533 31.659 -0.522 0.60152
## industrywhsale -236.095 25.097 -9.407 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 422.7 on 13066 degrees of freedom
## Multiple R-squared: 0.2923, Adjusted R-squared: 0.2907
## F-statistic: 192.7 on 28 and 13066 DF, p-value: < 2.2e-16
The results between the mean and multiple imputed models are similar in terms of beta values and signs. However, the imputed model performed better as indicated by the higher adjusted R squared value. The R squared value indicates that 29% of the variation in weekly income is explained by the model, with age group, race/ethnicity, sex and industry all being statistically significant at the >.99 level.
##How do the results compare to the results from the model fit with the data source with missing values? Again, the betas are largely similar, however the adjusted R squared value is greater for the imputed model. The model fit using the missing values slightly outperformed the mean imputed model.