hw6

##The oringinal codes from lab

load("~/Desktop/.RData")
attach(acs2017_ny)

## The following object is masked _by_ .GlobalEnv:
## 
##     OWNCOST

attach(dat_NYC)

## The following object is masked _by_ .GlobalEnv:
## 
##     OWNCOST

## The following objects are masked from acs2017_ny:
## 
##     AfAm, AGE, Amindian, ANCESTR1, ANCESTR1D, ANCESTR2, ANCESTR2D,
##     Asian, below_150poverty, below_200poverty, below_povertyline, BPL,
##     BPLD, BUILTYR2, CITIZEN, CLASSWKR, CLASSWKRD, Commute_bus,
##     Commute_car, Commute_other, Commute_rail, Commute_subway, COSTELEC,
##     COSTFUEL, COSTGAS, COSTWATR, DEGFIELD, DEGFIELD2, DEGFIELD2D,
##     DEGFIELDD, DEPARTS, EDUC, educ_advdeg, educ_college, educ_hs,
##     educ_nohs, educ_somecoll, EDUCD, EMPSTAT, EMPSTATD, FAMSIZE,
##     female, foodstamps, FOODSTMP, FTOTINC, FUELHEAT, GQ,
##     has_AnyHealthIns, has_PvtHealthIns, HCOVANY, HCOVPRIV, HHINCOME,
##     Hisp_Cuban, Hisp_DomR, Hisp_Mex, Hisp_PR, HISPAN, HISPAND,
##     Hispanic, in_Bronx, in_Brooklyn, in_Manhattan, in_Nassau, in_NYC,
##     in_Queens, in_StatenI, in_Westchester, INCTOT, INCWAGE, IND,
##     LABFORCE, LINGISOL, MARST, MIGCOUNTY1, MIGPLAC1, MIGPUMA1,
##     MIGRATE1, MIGRATE1D, MORTGAGE, NCHILD, NCHLT5, OCC, OWNCOST,
##     OWNERSHP, OWNERSHPD, POVERTY, PUMA, PWPUMA00, RACE, race_oth,
##     RACED, RELATE, RELATED, RENT, ROOMS, SCHOOL, SEX, SSMC, TRANTIME,
##     TRANWORK, UHRSWORK, UNITSSTR, unmarried, veteran, VETSTAT,
##     VETSTATD, white, WKSWORK2, YRSUSA1

detach()

acs2017_ny$LABFORCE <- as.factor(acs2017_ny$LABFORCE)
levels(acs2017_ny$LABFORCE) <- c("NA","Not in LF","in LF")

acs2017_ny$MARST <- as.factor(acs2017_ny$MARST)
levels(acs2017_ny$MARST) <- c("married spouse present","married spouse absent","separated","divorced","widowed","never married")

#cut age in different range
acs2017_ny$age_bands <- cut(acs2017_ny$AGE,breaks=c(0,25,35,45,55,65,100))
table(acs2017_ny$age_bands,acs2017_ny$LABFORCE)

##           
##               NA Not in LF in LF
##   (0,25]   31680     11717 13256
##   (25,35]      0      4271 20523
##   (35,45]      0      4064 18924
##   (45,55]      0      5406 21747
##   (55,65]      0     10563 18106
##   (65,100]     0     28701  5880

pick_use1 <- (acs2017_ny$AGE >25) & (acs2017_ny$AGE <= 55)
dat_use1 <- subset(acs2017_ny, pick_use1)

dat_use1$LABFORCE <- droplevels(dat_use1$LABFORCE)

Q1: What is the difference between “NA” as label and Not in the Labor Force? Make sure you understand. (Hint, look at ages in each group). NA data are concentrated in age between 0~25, where children and student make up the most. A large proportion of people between 55~100 is out of laborforce, because that’s the age of retirement.

In general it is a good idea to check summary stats before doing fancier models. What fraction of people, say, 55-65, are in the labor force? What about other age ranges? What would you guess are other important predictors? For example,

model_logit1 <- glm(LABFORCE ~ AGE + I(AGE^2) + female + AfAm + Asian + race_oth + Hispanic 
            + educ_hs + educ_somecoll + educ_college + educ_advdeg 
            + MARST,
            family = binomial, data = dat_use1)
summary(model_logit1)

## 
## Call:
## glm(formula = LABFORCE ~ AGE + I(AGE^2) + female + AfAm + Asian + 
##     race_oth + Hispanic + educ_hs + educ_somecoll + educ_college + 
##     educ_advdeg + MARST, family = binomial, data = dat_use1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6277   0.3476   0.4862   0.6459   1.5245  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 0.6023215  0.2445543   2.463  0.01378 *  
## AGE                         0.0171486  0.0121072   1.416  0.15666    
## I(AGE^2)                   -0.0003149  0.0001471  -2.141  0.03228 *  
## female                     -0.6839386  0.0205171 -33.335  < 2e-16 ***
## AfAm                       -0.1906696  0.0282354  -6.753 1.45e-11 ***
## Asian                      -0.1112229  0.0374503  -2.970  0.00298 ** 
## race_oth                   -0.0781864  0.0332004  -2.355  0.01852 *  
## Hispanic                    0.1653724  0.0313524   5.275 1.33e-07 ***
## educ_hs                     0.8972780  0.0310196  28.926  < 2e-16 ***
## educ_somecoll               1.4531782  0.0350710  41.435  < 2e-16 ***
## educ_college                1.9430903  0.0370924  52.385  < 2e-16 ***
## educ_advdeg                 2.3676171  0.0437358  54.135  < 2e-16 ***
## MARSTmarried spouse absent -0.5222011  0.0517449 -10.092  < 2e-16 ***
## MARSTseparated             -0.1240651  0.0577062  -2.150  0.03156 *  
## MARSTdivorced               0.0619381  0.0375785   1.648  0.09930 .  
## MARSTwidowed               -0.3023247  0.0934446  -3.235  0.00121 ** 
## MARSTnever married         -0.3857612  0.0241093 -16.000  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 71408  on 74934  degrees of freedom
## Residual deviance: 64847  on 74918  degrees of freedom
## AIC: 64881
## 
## Number of Fisher Scoring iterations: 5

Except for age and MARSTwidowed, most of the variables are significant.

library(car)

## Loading required package: carData

vif(model_logit1)

##                     GVIF Df GVIF^(1/(2*Df))
## AGE           121.123694  1       11.005621
## I(AGE^2)      120.012948  1       10.955042
## female          1.039784  1        1.019698
## AfAm            1.122975  1        1.059705
## Asian           1.324404  1        1.150828
## race_oth        1.523372  1        1.234250
## Hispanic        1.432950  1        1.197059
## educ_hs         2.343929  1        1.530989
## educ_somecoll   2.020967  1        1.421607
## educ_college    1.916881  1        1.384515
## educ_advdeg     1.575844  1        1.255326
## MARST           1.304143  5        1.026910

library(generalhoslem)

## Loading required package: reshape

## Loading required package: MASS

logitgof(dat_use1$LABFORCE,fitted(model_logit1))

## 
##  Hosmer and Lemeshow test (binary model)
## 
## data:  dat_use1$LABFORCE, fitted(model_logit1)
## X-squared = 223.26, df = 8, p-value < 2.2e-16

or<-exp((summary(model_logit1))$coef[,'Estimate'])
or

##                (Intercept)                        AGE 
##                  1.8263537                  1.0172964 
##                   I(AGE^2)                     female 
##                  0.9996851                  0.5046256 
##                       AfAm                      Asian 
##                  0.8264056                  0.8947393 
##                   race_oth                   Hispanic 
##                  0.9247920                  1.1798324 
##                    educ_hs              educ_somecoll 
##                  2.4529172                  4.2766852 
##               educ_college                educ_advdeg 
##                  6.9802890                 10.6719315 
## MARSTmarried spouse absent             MARSTseparated 
##                  0.5932134                  0.8833223 
##              MARSTdivorced               MARSTwidowed 
##                  1.0638965                  0.7390980 
##         MARSTnever married 
##                  0.6799328

The HL test show the probability of people who are in the laborforce under each sector. Female, people who are in the marrige status of married spouse sbsent, widowed, never married are less likely to be participanted in the laborforce.

#Predict
set.seed(11111)
index<-sample(x=2,size=nrow(dat_use1),replace=TRUE,prob=c(0.7,0.3))
train<-dat_use1[index==1,]
test<-dat_use1[index==2,]
dim(dat_use1)

## [1] 74935   110

dim(train)

## [1] 52543   110

dim(test)

## [1] 22392   110

trainmodel<-glm(LABFORCE ~AGE + I(AGE^2) +female + MORTGAGE+ AfAm + Asian + Hispanic 
            + educ_hs + educ_somecoll + educ_college + educ_advdeg 
            + MARST,
            family = binomial, data = train)
prob<-predict(object=trainmodel,newdata=test,type="response")# use the trainmodel to predict
pred<-cbind(test,prob)
pred<-transform(pred,predict=ifelse(prob<=0.5,0,1)) #Reclassify the predicted probability values
ta<-table(pred$LABFORCE,pred$predict) #Compare the actual and predicted values of the model
ta

##            
##                 0     1
##   Not in LF   319  3809
##   in LF       264 18000

So 319+18000 results are matched the prediction and 264+3809 results failed.

sum_diag<-sum(diag(ta)) #the sum of the correct predict Numbers
sum<-sum(ta) #the sum of predict Numbers
sum_diag/sum #Prediction accuracy

## [1] 0.8181047

library("pROC")

## Type 'citation("pROC")' for a citation.

## 
## Attaching package: 'pROC'

## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var

roc_curve<-roc(test$LABFORCE,prob)

## Setting levels: control = Not in LF, case = in LF

## Setting direction: controls < cases

x<-1-roc_curve$specificities
y<-roc_curve$sensitivities
plot(x=x,y=y,xlim=c(0,1),ylim=c(0,1),xlab="1-specificity",
ylab="sensitivity",main="ROC Curve",type="l",lwd=2)
abline(a=0,b=1,col="grey")
auc<-roc_curve$auc
text(0.5,0.4,paste("AUC",round(auc,digits=2)),col="blue")

The acuracy of the model is 72%.

## adding two more variables into model_logit1 :Citizen and FAMSIZE; also add interaction with female and MARST
Model_2<- glm(LABFORCE ~ AGE + I(AGE^2) + female:MARST + AfAm + Asian + race_oth + Hispanic 
            + educ_hs + educ_somecoll + educ_college + educ_advdeg 
            + MARST+CITIZEN+FAMSIZE,
            family = binomial, data = dat_use1)
summary(Model_2)

## 
## Call:
## glm(formula = LABFORCE ~ AGE + I(AGE^2) + female:MARST + AfAm + 
##     Asian + race_oth + Hispanic + educ_hs + educ_somecoll + educ_college + 
##     educ_advdeg + MARST + CITIZEN + FAMSIZE, family = binomial, 
##     data = dat_use1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8996   0.2532   0.4671   0.6659   1.4997  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         1.3499539  0.2482441   5.438 5.39e-08 ***
## AGE                                 0.0095023  0.0122799   0.774 0.439043    
## I(AGE^2)                           -0.0002280  0.0001495  -1.525 0.127318    
## AfAm                               -0.2669098  0.0289949  -9.205  < 2e-16 ***
## Asian                              -0.2274792  0.0414146  -5.493 3.96e-08 ***
## race_oth                           -0.1159289  0.0340375  -3.406 0.000659 ***
## Hispanic                            0.0853312  0.0328280   2.599 0.009340 ** 
## educ_hs                             0.9089528  0.0317168  28.658  < 2e-16 ***
## educ_somecoll                       1.4685030  0.0358681  40.942  < 2e-16 ***
## educ_college                        1.9649967  0.0379162  51.825  < 2e-16 ***
## educ_advdeg                         2.4119685  0.0446396  54.032  < 2e-16 ***
## MARSTmarried spouse absent         -1.4238018  0.0765591 -18.597  < 2e-16 ***
## MARSTseparated                     -1.0219201  0.0955365 -10.697  < 2e-16 ***
## MARSTdivorced                      -0.9399517  0.0615023 -15.283  < 2e-16 ***
## MARSTwidowed                       -1.2520110  0.1970884  -6.353 2.12e-10 ***
## MARSTnever married                 -1.4265495  0.0403360 -35.367  < 2e-16 ***
## CITIZEN                             0.0682176  0.0109444   6.233 4.57e-10 ***
## FAMSIZE                             0.0232156  0.0063390   3.662 0.000250 ***
## female:MARSTmarried spouse present -1.7097487  0.0355509 -48.093  < 2e-16 ***
## female:MARSTmarried spouse absent  -0.2516292  0.0975290  -2.580 0.009879 ** 
## female:MARSTseparated              -0.3165100  0.1138226  -2.781 0.005424 ** 
## female:MARSTdivorced               -0.1252286  0.0691207  -1.812 0.070027 .  
## female:MARSTwidowed                -0.3312103  0.2202654  -1.504 0.132662    
## female:MARSTnever married           0.1439508  0.0330263   4.359 1.31e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 71408  on 74934  degrees of freedom
## Residual deviance: 63064  on 74911  degrees of freedom
## AIC: 63112
## 
## Number of Fisher Scoring iterations: 5

INTERPRETATION

1.After adding the CITIZEN and FAMSIZE as well as the interation between female and MARST, the AIC decreases compared with the model_logit1.It give more credibility to the model.

2.Citizen and people form a family are more likely to be in the workforce. What this model surprises us is that female who are never married is 14.39% higher to be in the workforce than male.

model_probit1 <- glm(LABFORCE ~ AGE + I(AGE^2) + female + AfAm + Asian + race_oth + Hispanic 
            + educ_hs + educ_somecoll + educ_college + educ_advdeg 
            + MARST,
            family = binomial(link = 'probit'), data = dat_use1)
summary(model_probit1)

## 
## Call:
## glm(formula = LABFORCE ~ AGE + I(AGE^2) + female + AfAm + Asian + 
##     race_oth + Hispanic + educ_hs + educ_somecoll + educ_college + 
##     educ_advdeg + MARST, family = binomial(link = "probit"), 
##     data = dat_use1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7097   0.3306   0.4885   0.6530   1.4999  
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 3.586e-01  1.382e-01   2.595 0.009469 ** 
## AGE                         1.023e-02  6.833e-03   1.497 0.134368    
## I(AGE^2)                   -1.874e-04  8.305e-05  -2.256 0.024066 *  
## female                     -3.976e-01  1.146e-02 -34.689  < 2e-16 ***
## AfAm                       -1.124e-01  1.629e-02  -6.901 5.18e-12 ***
## Asian                      -7.391e-02  2.114e-02  -3.496 0.000472 ***
## race_oth                   -4.363e-02  1.892e-02  -2.306 0.021129 *  
## Hispanic                    8.710e-02  1.779e-02   4.897 9.75e-07 ***
## educ_hs                     5.419e-01  1.887e-02  28.713  < 2e-16 ***
## educ_somecoll               8.580e-01  2.073e-02  41.401  < 2e-16 ***
## educ_college                1.123e+00  2.130e-02  52.708  < 2e-16 ***
## educ_advdeg                 1.339e+00  2.387e-02  56.110  < 2e-16 ***
## MARSTmarried spouse absent -3.041e-01  3.022e-02 -10.065  < 2e-16 ***
## MARSTseparated             -6.878e-02  3.318e-02  -2.073 0.038172 *  
## MARSTdivorced               3.401e-02  2.099e-02   1.621 0.105098    
## MARSTwidowed               -1.821e-01  5.510e-02  -3.304 0.000952 ***
## MARSTnever married         -2.153e-01  1.361e-02 -15.813  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 71408  on 74934  degrees of freedom
## Residual deviance: 64816  on 74918  degrees of freedom
## AIC: 64850
## 
## Number of Fisher Scoring iterations: 5

Compared to the model_logic1, the AIC slightly decreases.

hw6

Carol, JOYCE, Mei, Tami

11/9/2020

INTERPRETATION