library(car)
library(mice)
library(ggplot2)
library(dplyr)
library(readr)
ds1 <- read.csv("C:/Users/drayr/OneDrive/Desktop/DEM Spring 2021/Stats II 7183/Project/data/cps_00012.csv")
#The names in the data are very ugly, so I make them less ugly
nams<-names(ds1)
head(nams, n=10)
##  [1] "YEAR"     "SERIAL"   "MONTH"    "HWTFINL"  "CPSID"    "ASECFLAG"
##  [7] "STATEFIP" "COUNTY"   "FAMINC"   "PERNUM"
#we see some names are lower case, some are upper and some have a little _ in the first position. This is a nightmare.
newnames<-tolower(gsub(pattern = "_",replacement =  "",x =  nams))
names(ds1)<-newnames

summary(ds1)
##       year          serial          month           hwtfinl     
##  Min.   :2019   Min.   :56720   Min.   : 1.000   Min.   : 2336  
##  1st Qu.:2019   1st Qu.:58433   1st Qu.: 3.000   1st Qu.: 3864  
##  Median :2020   Median :59368   Median : 8.000   Median : 4310  
##  Mean   :2020   Mean   :59379   Mean   : 6.962   Mean   : 4557  
##  3rd Qu.:2020   3rd Qu.:60320   3rd Qu.:10.000   3rd Qu.: 4932  
##  Max.   :2021   Max.   :62078   Max.   :12.000   Max.   :21835  
##                                                                 
##      cpsid              asecflag        statefip      county     
##  Min.   :2.018e+13   Min.   :2       Min.   :48   Min.   :    0  
##  1st Qu.:2.019e+13   1st Qu.:2       1st Qu.:48   1st Qu.:    0  
##  Median :2.019e+13   Median :2       Median :48   Median :    0  
##  Mean   :2.019e+13   Mean   :2       Mean   :48   Mean   : 7452  
##  3rd Qu.:2.020e+13   3rd Qu.:2       3rd Qu.:48   3rd Qu.:    0  
##  Max.   :2.021e+13   Max.   :2       Max.   :48   Max.   :48485  
##                      NA's   :73989                               
##      faminc          pernum           wtfinl          cpsidp         
##  Min.   :100.0   Min.   : 1.000   Min.   :    0   Min.   :2.018e+13  
##  1st Qu.:730.0   1st Qu.: 1.000   1st Qu.: 3968   1st Qu.:2.019e+13  
##  Median :830.0   Median : 2.000   Median : 4476   Median :2.019e+13  
##  Mean   :754.5   Mean   : 1.946   Mean   : 4745   Mean   :2.019e+13  
##  3rd Qu.:842.0   3rd Qu.: 2.000   3rd Qu.: 5191   3rd Qu.:2.020e+13  
##  Max.   :843.0   Max.   :14.000   Max.   :21835   Max.   :2.021e+13  
##                                                                      
##       age          sex             race           hispan         empstat    
##  Min.   :16   Min.   :1.000   Min.   :100.0   Min.   :  0.0   Min.   : 1.0  
##  1st Qu.:28   1st Qu.:1.000   1st Qu.:100.0   1st Qu.:  0.0   1st Qu.:10.0  
##  Median :40   Median :2.000   Median :100.0   Median :  0.0   Median :10.0  
##  Mean   :40   Mean   :1.517   Mean   :157.7   Mean   : 66.7   Mean   :17.2  
##  3rd Qu.:52   3rd Qu.:2.000   3rd Qu.:100.0   3rd Qu.:100.0   3rd Qu.:32.0  
##  Max.   :64   Max.   :2.000   Max.   :814.0   Max.   :612.0   Max.   :36.0  
##                                                                             
##       ind          classwkr        durunem2          whyunemp     
##  Min.   :   0   Min.   : 0.00   Min.   : 0.0000   Min.   :0.0000  
##  1st Qu.:   0   1st Qu.: 0.00   1st Qu.: 0.0000   1st Qu.:0.0000  
##  Median :5180   Median :22.00   Median : 0.0000   Median :0.0000  
##  Mean   :4444   Mean   :15.91   Mean   : 0.3218   Mean   :0.1185  
##  3rd Qu.:7860   3rd Qu.:22.00   3rd Qu.: 0.0000   3rd Qu.:0.0000  
##  Max.   :9890   Max.   :29.00   Max.   :16.0000   Max.   :6.0000  
##                                                                   
##     whyptlwk          wnlook          wkstat         empsame     
##  Min.   :  0.00   Min.   :  1.0   Min.   :11.00   Min.   : 1.00  
##  1st Qu.:  0.00   1st Qu.:999.0   1st Qu.:11.00   1st Qu.: 2.00  
##  Median :  0.00   Median :999.0   Median :11.00   Median :99.00  
##  Mean   : 11.78   Mean   :978.2   Mean   :40.32   Mean   :58.89  
##  3rd Qu.:  0.00   3rd Qu.:999.0   3rd Qu.:99.00   3rd Qu.:99.00  
##  Max.   :130.00   Max.   :999.0   Max.   :99.00   Max.   :99.00  
##                                                                  
##     wrkoffer         earnwt         hourwage          paidhour     
##  Min.   : 1.00   Min.   :    0   Min.   :   1.01   Min.   :0.0000  
##  1st Qu.:99.00   1st Qu.:    0   1st Qu.: 999.99   1st Qu.:0.0000  
##  Median :99.00   Median :    0   Median : 999.99   Median :0.0000  
##  Mean   :98.27   Mean   : 4726   Mean   : 917.63   Mean   :0.2397  
##  3rd Qu.:99.00   3rd Qu.:12776   3rd Qu.: 999.99   3rd Qu.:0.0000  
##  Max.   :99.00   Max.   :65538   Max.   : 999.99   Max.   :2.0000  
##                                                                    
##      union           earnweek         uhrsworkorg       eligorg      
##  Min.   :0.0000   Min.   :    0.01   Min.   :  1.0   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 9999.99   1st Qu.:999.0   1st Qu.:0.0000  
##  Median :0.0000   Median : 9999.99   Median :999.0   Median :0.0000  
##  Mean   :0.1667   Mean   : 8604.45   Mean   :944.5   Mean   :0.1558  
##  3rd Qu.:0.0000   3rd Qu.: 9999.99   3rd Qu.:999.0   3rd Qu.:0.0000  
##  Max.   :3.0000   Max.   : 9999.99   Max.   :999.0   Max.   :1.0000  
##                                                                      
##      otpay         covidunaw       covidlook    
##  Min.   : 1.00   Min.   : 1.0    Min.   : 1.00  
##  1st Qu.:99.00   1st Qu.: 1.0    1st Qu.: 1.00  
##  Median :99.00   Median : 1.0    Median :99.00  
##  Mean   :83.76   Mean   : 1.5    Mean   :71.23  
##  3rd Qu.:99.00   3rd Qu.: 1.0    3rd Qu.:99.00  
##  Max.   :99.00   Max.   :99.0    Max.   :99.00  
##                  NA's   :41004   NA's   :41004

Recode Data

library(questionr)
library(dplyr)

ds2<-ds1%>%
  mutate(
          covid = as.factor(if_else(year==2020 & month>=4 | year==2021 & month<3,1,0)),
          covidint = as.factor(if_else(year==2019 & month<=9,"1precov_Q32019",
                                       if_else(year==2019 & month>=10,"2precov_Q42019",
                                       if_else(year==2020 & month<=2,"3precov_Q12020",
                                       if_else(year==2020 & month>=3 & month<=8,"4earlycov_Q22020",
                                       if_else(year==2020 & month>=9 & month<=12,"5midcov_Q32020",
                                       if_else(year==2021 & month>=1 & month<=3,"6latecov_Q42020","NA"))))))
                                       ),
          covid19unaw = Recode(covidunaw, recodes = "1='0'; 2= '1'; else=NA", as.factor=T),

            hhinc   =Recode(faminc,recodes="100:490='1_lt15k';500:600='2_15-25k';710:720='3_25-35k';
                          730:740='4_35-50k';810:830='5_50k-75k';840:843='6_75kplus';999='NA'", as.factor = T),
            agegrp=cut(age, breaks=c(16,24,34,44,54,64)),
          male=Recode(sex,recodes="2='0';1='1';9=NA", as.factor = T),
          raceeth=as.factor(if_else(race==100 & hispan==000,"NH_wht",
                          if_else(race==200 & hispan==000, "NH_blk",
                          if_else(!race%in%c('100','200','999') & hispan==000, "NH_other",
                          if_else(!hispan%in%c('000','901','902'), "Hisp","NA"))))
                          ),

            employst=Recode( empstat,recodes="1='Mil';10='emp';12='emp_nwklwk';20:22='unemp';32='NILF_utw';34='NILF_oth';36='NILF_ret'; else=NA" , as.factor = T),
          unemployed=Recode( empstat,recodes="1:12='0';20:22='1';else=NA"),
            industry=Recode( ind,recodes="770='const';1070:3990='manuf';4070:4590='whsale';4670:5790='retail';6070:6390='trans';570:690='utl';
                             6470:6672='media_ent';6870:6992='bnk_ins';7071:7072='realest';7860:7890='Teach';7970:8290='Health';8670:8690='rstrnt_svc';8770:8891='repair_maint';8970:8990='perscare';9160:9390='gen_supprt';9470='crimjust'; else=NA", as.factor = T ),
            selfemp=Recode( classwkr,recodes="10:14='1';20:29='0';else=NA" ),
            uedurwks    =Recode( durunem2,recodes="0:3='lt4wks';4:7='4-10wks';8:10='11-22wks';11:15='23-52wks';16='gt52wks'; else=NA", as.factor = T),
            whyue   =Recode( whyunemp,recodes="1='templayoff';2='jobloss';3='tempwk_end';4='leftjob';5='re_ent';6='new_ent'; else=NA" , as.factor = T),
            whypt   =Recode( whyptlwk,recodes="60='onlyfindpt';80='ftdown';100:101='health_med';111='persday';120:122='chcare_fam';
                           123='sch';130='oth'; else=NA" , as.factor = T),
            underemp=Recode( wkstat,recodes="12='1'; 13='1'; 21='1';99=NA; else='0'" ),#reconsider recodes
            sameemp=Recode( empsame,recodes="2='1';1='0';else=NA", as.factor=T ),
            offerwrk=Recode( wrkoffer,recodes= "2='0';1='1';else=NA"),
            paidhourly=Recode( paidhour,recodes="2='1';1='0';else=NA"),
            unioncover=Recode( union,recodes="2:3='1';1='0';else=NA"),
            earnerstud=as.factor(x=eligorg),
            ottippay=Recode( otpay,recodes="2='1';1='0';else=NA"),
          earnweekna=recode.na(earnweek,9999.99,as.numeric = T),
          earnyear=earnweek*52
         )
## Recoded 65562 values to NA.
ds3<-ds2%>%
  filter(earnerstud==1 )
    #select(earnwt, earnweekna, hhinc, covidint, agegrp, male, raceeth, industry)

ds3in <- ds3%>%  
  filter(sameemp==1 )
  
ds3out <- ds3%>%    
    filter(sameemp==0 )


set.seed(410)
samp<-sample(1:dim(ds3in)[1], size = 228) #smaller sample from group with same employer
ds3inbind<-ds3in[samp,]


ds4<-rbind(ds3out,ds3inbind)%>%
      select(wtfinl,earnwt, earnweekna, sameemp, covidint,age, agegrp, male, raceeth, industry)
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.8-33. For overview type 'help("mgcv-package")'.
gam1<-gam(sameemp~earnweekna+age+covidint+male+raceeth+industry,
            data=ds4,
            weights = wtfinl,
            family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(gam1)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## sameemp ~ earnweekna + age + covidint + male + raceeth + industry
## 
## Parametric coefficients:
##                            Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)              -9.065e-01  9.798e-03  -92.518  < 2e-16 ***
## earnweekna                5.855e-04  3.983e-06  147.009  < 2e-16 ***
## age                       2.914e-02  1.433e-04  203.335  < 2e-16 ***
## covidint2precov_Q42019   -4.795e-01  5.739e-03  -83.548  < 2e-16 ***
## covidint3precov_Q12020   -1.842e-01  6.743e-03  -27.313  < 2e-16 ***
## covidint4earlycov_Q22020 -1.380e-01  5.238e-03  -26.352  < 2e-16 ***
## covidint5midcov_Q32020   -3.214e-01  5.843e-03  -55.005  < 2e-16 ***
## covidint6latecov_Q42020   4.376e-01  6.985e-03   62.655  < 2e-16 ***
## male1                     6.515e-02  3.900e-03   16.705  < 2e-16 ***
## raceethNH_blk            -9.483e-01  6.384e-03 -148.544  < 2e-16 ***
## raceethNH_other          -3.121e-01  8.191e-03  -38.104  < 2e-16 ***
## raceethNH_wht            -3.079e-01  3.844e-03  -80.081  < 2e-16 ***
## industryconst            -1.224e+00  9.011e-03 -135.817  < 2e-16 ***
## industrycrimjust         -1.158e+00  1.409e-02  -82.169  < 2e-16 ***
## industrygen_supprt       -6.031e-02  1.085e-02   -5.558 2.72e-08 ***
## industryHealth           -6.360e-01  8.008e-03  -79.422  < 2e-16 ***
## industrymanuf             4.243e-01  8.763e-03   48.421  < 2e-16 ***
## industrymedia_ent        -8.267e-01  1.913e-02  -43.219  < 2e-16 ***
## industryperscare          1.315e+01  8.395e+00    1.566    0.117    
## industryrealest          -9.851e-01  1.208e-02  -81.548  < 2e-16 ***
## industryrepair_maint      1.051e+00  1.427e-02   73.625  < 2e-16 ***
## industryretail           -7.382e-01  7.939e-03  -92.977  < 2e-16 ***
## industryrstrnt_svc       -4.870e-01  8.930e-03  -54.537  < 2e-16 ***
## industryTeach            -2.967e-01  8.446e-03  -35.128  < 2e-16 ***
## industrytrans            -1.578e-01  8.685e-03  -18.165  < 2e-16 ***
## industryutl              -4.088e-01  1.259e-02  -32.485  < 2e-16 ***
## industrywhsale            5.461e-01  1.570e-02   34.782  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## 
## R-sq.(adj) =   0.06   Deviance explained = 9.98%
## UBRE = 5785.9  Scale est. = 1         n = 359
gam2<-gam(sameemp~s(earnweekna)+s(age)+covidint+male+raceeth+industry,
            data=ds4,
            weights = wtfinl,
            family=binomial)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
summary(gam2)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## sameemp ~ s(earnweekna) + s(age) + covidint + male + raceeth + 
##     industry
## 
## Parametric coefficients:
##                            Estimate Std. Error  z value Pr(>|z|)    
## (Intercept)                0.851946   0.008369  101.803   <2e-16 ***
## covidint2precov_Q42019    -0.444908   0.005927  -75.059   <2e-16 ***
## covidint3precov_Q12020    -0.125121   0.006951  -18.001   <2e-16 ***
## covidint4earlycov_Q22020  -0.201661   0.005469  -36.873   <2e-16 ***
## covidint5midcov_Q32020    -0.249959   0.006088  -41.054   <2e-16 ***
## covidint6latecov_Q42020    0.454445   0.007411   61.318   <2e-16 ***
## male1                      0.061196   0.004070   15.037   <2e-16 ***
## raceethNH_blk             -0.951292   0.006705 -141.883   <2e-16 ***
## raceethNH_other           -0.250881   0.008493  -29.538   <2e-16 ***
## raceethNH_wht             -0.328432   0.003976  -82.609   <2e-16 ***
## industryconst             -1.524032   0.009479 -160.779   <2e-16 ***
## industrycrimjust          -1.189358   0.014742  -80.678   <2e-16 ***
## industrygen_supprt        -0.115787   0.011109  -10.423   <2e-16 ***
## industryHealth            -0.668139   0.008365  -79.877   <2e-16 ***
## industrymanuf              0.441623   0.009103   48.516   <2e-16 ***
## industrymedia_ent         -1.142661   0.019787  -57.748   <2e-16 ***
## industryperscare          18.036281 102.268737    0.176     0.86    
## industryrealest           -1.033039   0.012606  -81.948   <2e-16 ***
## industryrepair_maint       0.795128   0.014682   54.155   <2e-16 ***
## industryretail            -0.887832   0.008256 -107.541   <2e-16 ***
## industryrstrnt_svc        -0.515327   0.009392  -54.868   <2e-16 ***
## industryTeach             -0.223744   0.008883  -25.188   <2e-16 ***
## industrytrans             -0.197969   0.008985  -22.033   <2e-16 ***
## industryutl               -0.516301   0.013461  -38.355   <2e-16 ***
## industrywhsale             0.588486   0.017435   33.754   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##                 edf Ref.df Chi.sq p-value    
## s(earnweekna) 8.945  8.999  38514  <2e-16 ***
## s(age)        8.934  8.999  70939  <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.0489   Deviance explained = 12.8%
## UBRE = 5604.8  Scale est. = 1         n = 359

According to the model results, the smooth (non-linear) terms are statistically significant. Covid period effects on staying with the same employer show a negative likelihood compared to the reference period.Further, men are more likely to have the same employer; blacks whites and other race/ethnicity are less likely compared to Hispanics; and all but manufacturing, maintenance, and wholesale industries have lower likelihood than bankers. The personal care industry does not appear to have an effect.

anova( gam1, gam2, test="Chisq")

The Chisquare test (LRT) results display a low p-value for the model with smooth terms, indicating the terms should be included in the model.

plot(gam2)

The plot of the model indicates that there is a positive relationship between keeping the same employer and weekly earnings up to $250. After which the relationship flatens and is less prevalent for higher earnings.The relationship with age oscilates but increases slightly overall until the age of 60 where the likelihood of having the same employer greatly increases for individuals older than 60.