##The oringinal codes from lab

load("~/Desktop/.RData")
attach(acs2017_ny)
## The following object is masked _by_ .GlobalEnv:
## 
##     OWNCOST
attach(dat_NYC)
## The following object is masked _by_ .GlobalEnv:
## 
##     OWNCOST
## The following objects are masked from acs2017_ny:
## 
##     AfAm, AGE, Amindian, ANCESTR1, ANCESTR1D, ANCESTR2, ANCESTR2D,
##     Asian, below_150poverty, below_200poverty, below_povertyline, BPL,
##     BPLD, BUILTYR2, CITIZEN, CLASSWKR, CLASSWKRD, Commute_bus,
##     Commute_car, Commute_other, Commute_rail, Commute_subway, COSTELEC,
##     COSTFUEL, COSTGAS, COSTWATR, DEGFIELD, DEGFIELD2, DEGFIELD2D,
##     DEGFIELDD, DEPARTS, EDUC, educ_advdeg, educ_college, educ_hs,
##     educ_nohs, educ_somecoll, EDUCD, EMPSTAT, EMPSTATD, FAMSIZE,
##     female, foodstamps, FOODSTMP, FTOTINC, FUELHEAT, GQ,
##     has_AnyHealthIns, has_PvtHealthIns, HCOVANY, HCOVPRIV, HHINCOME,
##     Hisp_Cuban, Hisp_DomR, Hisp_Mex, Hisp_PR, HISPAN, HISPAND,
##     Hispanic, in_Bronx, in_Brooklyn, in_Manhattan, in_Nassau, in_NYC,
##     in_Queens, in_StatenI, in_Westchester, INCTOT, INCWAGE, IND,
##     LABFORCE, LINGISOL, MARST, MIGCOUNTY1, MIGPLAC1, MIGPUMA1,
##     MIGRATE1, MIGRATE1D, MORTGAGE, NCHILD, NCHLT5, OCC, OWNCOST,
##     OWNERSHP, OWNERSHPD, POVERTY, PUMA, PWPUMA00, RACE, race_oth,
##     RACED, RELATE, RELATED, RENT, ROOMS, SCHOOL, SEX, SSMC, TRANTIME,
##     TRANWORK, UHRSWORK, UNITSSTR, unmarried, veteran, VETSTAT,
##     VETSTATD, white, WKSWORK2, YRSUSA1
detach()
acs2017_ny$LABFORCE <- as.factor(acs2017_ny$LABFORCE)
levels(acs2017_ny$LABFORCE) <- c("NA","Not in LF","in LF")

acs2017_ny$MARST <- as.factor(acs2017_ny$MARST)
levels(acs2017_ny$MARST) <- c("married spouse present","married spouse absent","separated","divorced","widowed","never married")
#cut age in different range
acs2017_ny$age_bands <- cut(acs2017_ny$AGE,breaks=c(0,25,35,45,55,65,100))
table(acs2017_ny$age_bands,acs2017_ny$LABFORCE)
##           
##               NA Not in LF in LF
##   (0,25]   31680     11717 13256
##   (25,35]      0      4271 20523
##   (35,45]      0      4064 18924
##   (45,55]      0      5406 21747
##   (55,65]      0     10563 18106
##   (65,100]     0     28701  5880
pick_use1 <- (acs2017_ny$AGE >25) & (acs2017_ny$AGE <= 55)
dat_use1 <- subset(acs2017_ny, pick_use1)

dat_use1$LABFORCE <- droplevels(dat_use1$LABFORCE) 

Q1: What is the difference between “NA” as label and Not in the Labor Force? Make sure you understand. (Hint, look at ages in each group). NA data are concentrated in age between 0~25, where children and student make up the most. A large proportion of people between 55~100 is out of laborforce, because that’s the age of retirement.

In general it is a good idea to check summary stats before doing fancier models. What fraction of people, say, 55-65, are in the labor force? What about other age ranges? What would you guess are other important predictors? For example,

model_logit1 <- glm(LABFORCE ~ AGE + I(AGE^2) + female + AfAm + Asian + race_oth + Hispanic 
            + educ_hs + educ_somecoll + educ_college + educ_advdeg 
            + MARST,
            family = binomial, data = dat_use1)
summary(model_logit1)
## 
## Call:
## glm(formula = LABFORCE ~ AGE + I(AGE^2) + female + AfAm + Asian + 
##     race_oth + Hispanic + educ_hs + educ_somecoll + educ_college + 
##     educ_advdeg + MARST, family = binomial, data = dat_use1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6277   0.3476   0.4862   0.6459   1.5245  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 0.6023215  0.2445543   2.463  0.01378 *  
## AGE                         0.0171486  0.0121072   1.416  0.15666    
## I(AGE^2)                   -0.0003149  0.0001471  -2.141  0.03228 *  
## female                     -0.6839386  0.0205171 -33.335  < 2e-16 ***
## AfAm                       -0.1906696  0.0282354  -6.753 1.45e-11 ***
## Asian                      -0.1112229  0.0374503  -2.970  0.00298 ** 
## race_oth                   -0.0781864  0.0332004  -2.355  0.01852 *  
## Hispanic                    0.1653724  0.0313524   5.275 1.33e-07 ***
## educ_hs                     0.8972780  0.0310196  28.926  < 2e-16 ***
## educ_somecoll               1.4531782  0.0350710  41.435  < 2e-16 ***
## educ_college                1.9430903  0.0370924  52.385  < 2e-16 ***
## educ_advdeg                 2.3676171  0.0437358  54.135  < 2e-16 ***
## MARSTmarried spouse absent -0.5222011  0.0517449 -10.092  < 2e-16 ***
## MARSTseparated             -0.1240651  0.0577062  -2.150  0.03156 *  
## MARSTdivorced               0.0619381  0.0375785   1.648  0.09930 .  
## MARSTwidowed               -0.3023247  0.0934446  -3.235  0.00121 ** 
## MARSTnever married         -0.3857612  0.0241093 -16.000  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 71408  on 74934  degrees of freedom
## Residual deviance: 64847  on 74918  degrees of freedom
## AIC: 64881
## 
## Number of Fisher Scoring iterations: 5

Except for age and MARSTwidowed, most of the variables are significant.

library(car)
## Loading required package: carData
vif(model_logit1)
##                     GVIF Df GVIF^(1/(2*Df))
## AGE           121.123694  1       11.005621
## I(AGE^2)      120.012948  1       10.955042
## female          1.039784  1        1.019698
## AfAm            1.122975  1        1.059705
## Asian           1.324404  1        1.150828
## race_oth        1.523372  1        1.234250
## Hispanic        1.432950  1        1.197059
## educ_hs         2.343929  1        1.530989
## educ_somecoll   2.020967  1        1.421607
## educ_college    1.916881  1        1.384515
## educ_advdeg     1.575844  1        1.255326
## MARST           1.304143  5        1.026910
library(generalhoslem)
## Loading required package: reshape
## Loading required package: MASS
logitgof(dat_use1$LABFORCE,fitted(model_logit1))
## 
##  Hosmer and Lemeshow test (binary model)
## 
## data:  dat_use1$LABFORCE, fitted(model_logit1)
## X-squared = 223.26, df = 8, p-value < 2.2e-16
or<-exp((summary(model_logit1))$coef[,'Estimate'])
or
##                (Intercept)                        AGE 
##                  1.8263537                  1.0172964 
##                   I(AGE^2)                     female 
##                  0.9996851                  0.5046256 
##                       AfAm                      Asian 
##                  0.8264056                  0.8947393 
##                   race_oth                   Hispanic 
##                  0.9247920                  1.1798324 
##                    educ_hs              educ_somecoll 
##                  2.4529172                  4.2766852 
##               educ_college                educ_advdeg 
##                  6.9802890                 10.6719315 
## MARSTmarried spouse absent             MARSTseparated 
##                  0.5932134                  0.8833223 
##              MARSTdivorced               MARSTwidowed 
##                  1.0638965                  0.7390980 
##         MARSTnever married 
##                  0.6799328

The HL test show the probability of people who are in the laborforce under each sector. Female, people who are in the marrige status of married spouse sbsent, widowed, never married are less likely to be participanted in the laborforce.

#Predict
set.seed(11111)
index<-sample(x=2,size=nrow(dat_use1),replace=TRUE,prob=c(0.7,0.3))
train<-dat_use1[index==1,]
test<-dat_use1[index==2,]
dim(dat_use1)
## [1] 74935   110
dim(train)
## [1] 52543   110
dim(test)
## [1] 22392   110
trainmodel<-glm(LABFORCE ~AGE + I(AGE^2) +female + MORTGAGE+ AfAm + Asian + Hispanic 
            + educ_hs + educ_somecoll + educ_college + educ_advdeg 
            + MARST,
            family = binomial, data = train)
prob<-predict(object=trainmodel,newdata=test,type="response")# use the trainmodel to predict
pred<-cbind(test,prob)
pred<-transform(pred,predict=ifelse(prob<=0.5,0,1)) #Reclassify the predicted probability values
ta<-table(pred$LABFORCE,pred$predict) #Compare the actual and predicted values of the model
ta
##            
##                 0     1
##   Not in LF   319  3809
##   in LF       264 18000

So 319+18000 results are matched the prediction and 264+3809 results failed.

sum_diag<-sum(diag(ta)) #the sum of the correct predict Numbers
sum<-sum(ta) #the sum of predict Numbers
sum_diag/sum #Prediction accuracy
## [1] 0.8181047
library("pROC")
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc_curve<-roc(test$LABFORCE,prob)
## Setting levels: control = Not in LF, case = in LF
## Setting direction: controls < cases
x<-1-roc_curve$specificities
y<-roc_curve$sensitivities
plot(x=x,y=y,xlim=c(0,1),ylim=c(0,1),xlab="1-specificity",
ylab="sensitivity",main="ROC Curve",type="l",lwd=2)
abline(a=0,b=1,col="grey")
auc<-roc_curve$auc
text(0.5,0.4,paste("AUC",round(auc,digits=2)),col="blue")

The acuracy of the model is 72%.

## adding two more variables into model_logit1 :Citizen and FAMSIZE; also add interaction with female and MARST
Model_2<- glm(LABFORCE ~ AGE + I(AGE^2) + female:MARST + AfAm + Asian + race_oth + Hispanic 
            + educ_hs + educ_somecoll + educ_college + educ_advdeg 
            + MARST+CITIZEN+FAMSIZE,
            family = binomial, data = dat_use1)
summary(Model_2)
## 
## Call:
## glm(formula = LABFORCE ~ AGE + I(AGE^2) + female:MARST + AfAm + 
##     Asian + race_oth + Hispanic + educ_hs + educ_somecoll + educ_college + 
##     educ_advdeg + MARST + CITIZEN + FAMSIZE, family = binomial, 
##     data = dat_use1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8996   0.2532   0.4671   0.6659   1.4997  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         1.3499539  0.2482441   5.438 5.39e-08 ***
## AGE                                 0.0095023  0.0122799   0.774 0.439043    
## I(AGE^2)                           -0.0002280  0.0001495  -1.525 0.127318    
## AfAm                               -0.2669098  0.0289949  -9.205  < 2e-16 ***
## Asian                              -0.2274792  0.0414146  -5.493 3.96e-08 ***
## race_oth                           -0.1159289  0.0340375  -3.406 0.000659 ***
## Hispanic                            0.0853312  0.0328280   2.599 0.009340 ** 
## educ_hs                             0.9089528  0.0317168  28.658  < 2e-16 ***
## educ_somecoll                       1.4685030  0.0358681  40.942  < 2e-16 ***
## educ_college                        1.9649967  0.0379162  51.825  < 2e-16 ***
## educ_advdeg                         2.4119685  0.0446396  54.032  < 2e-16 ***
## MARSTmarried spouse absent         -1.4238018  0.0765591 -18.597  < 2e-16 ***
## MARSTseparated                     -1.0219201  0.0955365 -10.697  < 2e-16 ***
## MARSTdivorced                      -0.9399517  0.0615023 -15.283  < 2e-16 ***
## MARSTwidowed                       -1.2520110  0.1970884  -6.353 2.12e-10 ***
## MARSTnever married                 -1.4265495  0.0403360 -35.367  < 2e-16 ***
## CITIZEN                             0.0682176  0.0109444   6.233 4.57e-10 ***
## FAMSIZE                             0.0232156  0.0063390   3.662 0.000250 ***
## female:MARSTmarried spouse present -1.7097487  0.0355509 -48.093  < 2e-16 ***
## female:MARSTmarried spouse absent  -0.2516292  0.0975290  -2.580 0.009879 ** 
## female:MARSTseparated              -0.3165100  0.1138226  -2.781 0.005424 ** 
## female:MARSTdivorced               -0.1252286  0.0691207  -1.812 0.070027 .  
## female:MARSTwidowed                -0.3312103  0.2202654  -1.504 0.132662    
## female:MARSTnever married           0.1439508  0.0330263   4.359 1.31e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 71408  on 74934  degrees of freedom
## Residual deviance: 63064  on 74911  degrees of freedom
## AIC: 63112
## 
## Number of Fisher Scoring iterations: 5

INTERPRETATION

1.After adding the CITIZEN and FAMSIZE as well as the interation between female and MARST, the AIC decreases compared with the model_logit1.It give more credibility to the model.

2.Citizen and people form a family are more likely to be in the workforce. What this model surprises us is that female who are never married is 14.39% higher to be in the workforce than male.

model_probit1 <- glm(LABFORCE ~ AGE + I(AGE^2) + female + AfAm + Asian + race_oth + Hispanic 
            + educ_hs + educ_somecoll + educ_college + educ_advdeg 
            + MARST,
            family = binomial(link = 'probit'), data = dat_use1)
summary(model_probit1)
## 
## Call:
## glm(formula = LABFORCE ~ AGE + I(AGE^2) + female + AfAm + Asian + 
##     race_oth + Hispanic + educ_hs + educ_somecoll + educ_college + 
##     educ_advdeg + MARST, family = binomial(link = "probit"), 
##     data = dat_use1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7097   0.3306   0.4885   0.6530   1.4999  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 3.586e-01  1.382e-01   2.595 0.009469 ** 
## AGE                         1.023e-02  6.833e-03   1.497 0.134368    
## I(AGE^2)                   -1.874e-04  8.305e-05  -2.256 0.024066 *  
## female                     -3.976e-01  1.146e-02 -34.689  < 2e-16 ***
## AfAm                       -1.124e-01  1.629e-02  -6.901 5.18e-12 ***
## Asian                      -7.391e-02  2.114e-02  -3.496 0.000472 ***
## race_oth                   -4.363e-02  1.892e-02  -2.306 0.021129 *  
## Hispanic                    8.710e-02  1.779e-02   4.897 9.75e-07 ***
## educ_hs                     5.419e-01  1.887e-02  28.713  < 2e-16 ***
## educ_somecoll               8.580e-01  2.073e-02  41.401  < 2e-16 ***
## educ_college                1.123e+00  2.130e-02  52.708  < 2e-16 ***
## educ_advdeg                 1.339e+00  2.387e-02  56.110  < 2e-16 ***
## MARSTmarried spouse absent -3.041e-01  3.022e-02 -10.065  < 2e-16 ***
## MARSTseparated             -6.878e-02  3.318e-02  -2.073 0.038172 *  
## MARSTdivorced               3.401e-02  2.099e-02   1.621 0.105098    
## MARSTwidowed               -1.821e-01  5.510e-02  -3.304 0.000952 ***
## MARSTnever married         -2.153e-01  1.361e-02 -15.813  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 71408  on 74934  degrees of freedom
## Residual deviance: 64816  on 74918  degrees of freedom
## AIC: 64850
## 
## Number of Fisher Scoring iterations: 5

Compared to the model_logic1, the AIC slightly decreases.