##The oringinal codes from lab
load("~/Desktop/.RData")
attach(acs2017_ny)
## The following object is masked _by_ .GlobalEnv:
##
## OWNCOST
attach(dat_NYC)
## The following object is masked _by_ .GlobalEnv:
##
## OWNCOST
## The following objects are masked from acs2017_ny:
##
## AfAm, AGE, Amindian, ANCESTR1, ANCESTR1D, ANCESTR2, ANCESTR2D,
## Asian, below_150poverty, below_200poverty, below_povertyline, BPL,
## BPLD, BUILTYR2, CITIZEN, CLASSWKR, CLASSWKRD, Commute_bus,
## Commute_car, Commute_other, Commute_rail, Commute_subway, COSTELEC,
## COSTFUEL, COSTGAS, COSTWATR, DEGFIELD, DEGFIELD2, DEGFIELD2D,
## DEGFIELDD, DEPARTS, EDUC, educ_advdeg, educ_college, educ_hs,
## educ_nohs, educ_somecoll, EDUCD, EMPSTAT, EMPSTATD, FAMSIZE,
## female, foodstamps, FOODSTMP, FTOTINC, FUELHEAT, GQ,
## has_AnyHealthIns, has_PvtHealthIns, HCOVANY, HCOVPRIV, HHINCOME,
## Hisp_Cuban, Hisp_DomR, Hisp_Mex, Hisp_PR, HISPAN, HISPAND,
## Hispanic, in_Bronx, in_Brooklyn, in_Manhattan, in_Nassau, in_NYC,
## in_Queens, in_StatenI, in_Westchester, INCTOT, INCWAGE, IND,
## LABFORCE, LINGISOL, MARST, MIGCOUNTY1, MIGPLAC1, MIGPUMA1,
## MIGRATE1, MIGRATE1D, MORTGAGE, NCHILD, NCHLT5, OCC, OWNCOST,
## OWNERSHP, OWNERSHPD, POVERTY, PUMA, PWPUMA00, RACE, race_oth,
## RACED, RELATE, RELATED, RENT, ROOMS, SCHOOL, SEX, SSMC, TRANTIME,
## TRANWORK, UHRSWORK, UNITSSTR, unmarried, veteran, VETSTAT,
## VETSTATD, white, WKSWORK2, YRSUSA1
detach()
acs2017_ny$LABFORCE <- as.factor(acs2017_ny$LABFORCE)
levels(acs2017_ny$LABFORCE) <- c("NA","Not in LF","in LF")
acs2017_ny$MARST <- as.factor(acs2017_ny$MARST)
levels(acs2017_ny$MARST) <- c("married spouse present","married spouse absent","separated","divorced","widowed","never married")
#cut age in different range
acs2017_ny$age_bands <- cut(acs2017_ny$AGE,breaks=c(0,25,35,45,55,65,100))
table(acs2017_ny$age_bands,acs2017_ny$LABFORCE)
##
## NA Not in LF in LF
## (0,25] 31680 11717 13256
## (25,35] 0 4271 20523
## (35,45] 0 4064 18924
## (45,55] 0 5406 21747
## (55,65] 0 10563 18106
## (65,100] 0 28701 5880
pick_use1 <- (acs2017_ny$AGE >25) & (acs2017_ny$AGE <= 55)
dat_use1 <- subset(acs2017_ny, pick_use1)
dat_use1$LABFORCE <- droplevels(dat_use1$LABFORCE)
Q1: What is the difference between “NA” as label and Not in the Labor Force? Make sure you understand. (Hint, look at ages in each group). NA data are concentrated in age between 0~25, where children and student make up the most. A large proportion of people between 55~100 is out of laborforce, because that’s the age of retirement.
In general it is a good idea to check summary stats before doing fancier models. What fraction of people, say, 55-65, are in the labor force? What about other age ranges? What would you guess are other important predictors? For example,
model_logit1 <- glm(LABFORCE ~ AGE + I(AGE^2) + female + AfAm + Asian + race_oth + Hispanic
+ educ_hs + educ_somecoll + educ_college + educ_advdeg
+ MARST,
family = binomial, data = dat_use1)
summary(model_logit1)
##
## Call:
## glm(formula = LABFORCE ~ AGE + I(AGE^2) + female + AfAm + Asian +
## race_oth + Hispanic + educ_hs + educ_somecoll + educ_college +
## educ_advdeg + MARST, family = binomial, data = dat_use1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6277 0.3476 0.4862 0.6459 1.5245
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.6023215 0.2445543 2.463 0.01378 *
## AGE 0.0171486 0.0121072 1.416 0.15666
## I(AGE^2) -0.0003149 0.0001471 -2.141 0.03228 *
## female -0.6839386 0.0205171 -33.335 < 2e-16 ***
## AfAm -0.1906696 0.0282354 -6.753 1.45e-11 ***
## Asian -0.1112229 0.0374503 -2.970 0.00298 **
## race_oth -0.0781864 0.0332004 -2.355 0.01852 *
## Hispanic 0.1653724 0.0313524 5.275 1.33e-07 ***
## educ_hs 0.8972780 0.0310196 28.926 < 2e-16 ***
## educ_somecoll 1.4531782 0.0350710 41.435 < 2e-16 ***
## educ_college 1.9430903 0.0370924 52.385 < 2e-16 ***
## educ_advdeg 2.3676171 0.0437358 54.135 < 2e-16 ***
## MARSTmarried spouse absent -0.5222011 0.0517449 -10.092 < 2e-16 ***
## MARSTseparated -0.1240651 0.0577062 -2.150 0.03156 *
## MARSTdivorced 0.0619381 0.0375785 1.648 0.09930 .
## MARSTwidowed -0.3023247 0.0934446 -3.235 0.00121 **
## MARSTnever married -0.3857612 0.0241093 -16.000 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 71408 on 74934 degrees of freedom
## Residual deviance: 64847 on 74918 degrees of freedom
## AIC: 64881
##
## Number of Fisher Scoring iterations: 5
Except for age and MARSTwidowed, most of the variables are significant.
library(car)
## Loading required package: carData
vif(model_logit1)
## GVIF Df GVIF^(1/(2*Df))
## AGE 121.123694 1 11.005621
## I(AGE^2) 120.012948 1 10.955042
## female 1.039784 1 1.019698
## AfAm 1.122975 1 1.059705
## Asian 1.324404 1 1.150828
## race_oth 1.523372 1 1.234250
## Hispanic 1.432950 1 1.197059
## educ_hs 2.343929 1 1.530989
## educ_somecoll 2.020967 1 1.421607
## educ_college 1.916881 1 1.384515
## educ_advdeg 1.575844 1 1.255326
## MARST 1.304143 5 1.026910
library(generalhoslem)
## Loading required package: reshape
## Loading required package: MASS
logitgof(dat_use1$LABFORCE,fitted(model_logit1))
##
## Hosmer and Lemeshow test (binary model)
##
## data: dat_use1$LABFORCE, fitted(model_logit1)
## X-squared = 223.26, df = 8, p-value < 2.2e-16
or<-exp((summary(model_logit1))$coef[,'Estimate'])
or
## (Intercept) AGE
## 1.8263537 1.0172964
## I(AGE^2) female
## 0.9996851 0.5046256
## AfAm Asian
## 0.8264056 0.8947393
## race_oth Hispanic
## 0.9247920 1.1798324
## educ_hs educ_somecoll
## 2.4529172 4.2766852
## educ_college educ_advdeg
## 6.9802890 10.6719315
## MARSTmarried spouse absent MARSTseparated
## 0.5932134 0.8833223
## MARSTdivorced MARSTwidowed
## 1.0638965 0.7390980
## MARSTnever married
## 0.6799328
The HL test show the probability of people who are in the laborforce under each sector. Female, people who are in the marrige status of married spouse sbsent, widowed, never married are less likely to be participanted in the laborforce.
#Predict
set.seed(11111)
index<-sample(x=2,size=nrow(dat_use1),replace=TRUE,prob=c(0.7,0.3))
train<-dat_use1[index==1,]
test<-dat_use1[index==2,]
dim(dat_use1)
## [1] 74935 110
dim(train)
## [1] 52543 110
dim(test)
## [1] 22392 110
trainmodel<-glm(LABFORCE ~AGE + I(AGE^2) +female + MORTGAGE+ AfAm + Asian + Hispanic
+ educ_hs + educ_somecoll + educ_college + educ_advdeg
+ MARST,
family = binomial, data = train)
prob<-predict(object=trainmodel,newdata=test,type="response")# use the trainmodel to predict
pred<-cbind(test,prob)
pred<-transform(pred,predict=ifelse(prob<=0.5,0,1)) #Reclassify the predicted probability values
ta<-table(pred$LABFORCE,pred$predict) #Compare the actual and predicted values of the model
ta
##
## 0 1
## Not in LF 319 3809
## in LF 264 18000
So 319+18000 results are matched the prediction and 264+3809 results failed.
sum_diag<-sum(diag(ta)) #the sum of the correct predict Numbers
sum<-sum(ta) #the sum of predict Numbers
sum_diag/sum #Prediction accuracy
## [1] 0.8181047
library("pROC")
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
roc_curve<-roc(test$LABFORCE,prob)
## Setting levels: control = Not in LF, case = in LF
## Setting direction: controls < cases
x<-1-roc_curve$specificities
y<-roc_curve$sensitivities
plot(x=x,y=y,xlim=c(0,1),ylim=c(0,1),xlab="1-specificity",
ylab="sensitivity",main="ROC Curve",type="l",lwd=2)
abline(a=0,b=1,col="grey")
auc<-roc_curve$auc
text(0.5,0.4,paste("AUC",round(auc,digits=2)),col="blue")
The acuracy of the model is 72%.
## adding two more variables into model_logit1 :Citizen and FAMSIZE; also add interaction with female and MARST
Model_2<- glm(LABFORCE ~ AGE + I(AGE^2) + female:MARST + AfAm + Asian + race_oth + Hispanic
+ educ_hs + educ_somecoll + educ_college + educ_advdeg
+ MARST+CITIZEN+FAMSIZE,
family = binomial, data = dat_use1)
summary(Model_2)
##
## Call:
## glm(formula = LABFORCE ~ AGE + I(AGE^2) + female:MARST + AfAm +
## Asian + race_oth + Hispanic + educ_hs + educ_somecoll + educ_college +
## educ_advdeg + MARST + CITIZEN + FAMSIZE, family = binomial,
## data = dat_use1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8996 0.2532 0.4671 0.6659 1.4997
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.3499539 0.2482441 5.438 5.39e-08 ***
## AGE 0.0095023 0.0122799 0.774 0.439043
## I(AGE^2) -0.0002280 0.0001495 -1.525 0.127318
## AfAm -0.2669098 0.0289949 -9.205 < 2e-16 ***
## Asian -0.2274792 0.0414146 -5.493 3.96e-08 ***
## race_oth -0.1159289 0.0340375 -3.406 0.000659 ***
## Hispanic 0.0853312 0.0328280 2.599 0.009340 **
## educ_hs 0.9089528 0.0317168 28.658 < 2e-16 ***
## educ_somecoll 1.4685030 0.0358681 40.942 < 2e-16 ***
## educ_college 1.9649967 0.0379162 51.825 < 2e-16 ***
## educ_advdeg 2.4119685 0.0446396 54.032 < 2e-16 ***
## MARSTmarried spouse absent -1.4238018 0.0765591 -18.597 < 2e-16 ***
## MARSTseparated -1.0219201 0.0955365 -10.697 < 2e-16 ***
## MARSTdivorced -0.9399517 0.0615023 -15.283 < 2e-16 ***
## MARSTwidowed -1.2520110 0.1970884 -6.353 2.12e-10 ***
## MARSTnever married -1.4265495 0.0403360 -35.367 < 2e-16 ***
## CITIZEN 0.0682176 0.0109444 6.233 4.57e-10 ***
## FAMSIZE 0.0232156 0.0063390 3.662 0.000250 ***
## female:MARSTmarried spouse present -1.7097487 0.0355509 -48.093 < 2e-16 ***
## female:MARSTmarried spouse absent -0.2516292 0.0975290 -2.580 0.009879 **
## female:MARSTseparated -0.3165100 0.1138226 -2.781 0.005424 **
## female:MARSTdivorced -0.1252286 0.0691207 -1.812 0.070027 .
## female:MARSTwidowed -0.3312103 0.2202654 -1.504 0.132662
## female:MARSTnever married 0.1439508 0.0330263 4.359 1.31e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 71408 on 74934 degrees of freedom
## Residual deviance: 63064 on 74911 degrees of freedom
## AIC: 63112
##
## Number of Fisher Scoring iterations: 5
1.After adding the CITIZEN and FAMSIZE as well as the interation between female and MARST, the AIC decreases compared with the model_logit1.It give more credibility to the model.
2.Citizen and people form a family are more likely to be in the workforce. What this model surprises us is that female who are never married is 14.39% higher to be in the workforce than male.
model_probit1 <- glm(LABFORCE ~ AGE + I(AGE^2) + female + AfAm + Asian + race_oth + Hispanic
+ educ_hs + educ_somecoll + educ_college + educ_advdeg
+ MARST,
family = binomial(link = 'probit'), data = dat_use1)
summary(model_probit1)
##
## Call:
## glm(formula = LABFORCE ~ AGE + I(AGE^2) + female + AfAm + Asian +
## race_oth + Hispanic + educ_hs + educ_somecoll + educ_college +
## educ_advdeg + MARST, family = binomial(link = "probit"),
## data = dat_use1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7097 0.3306 0.4885 0.6530 1.4999
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.586e-01 1.382e-01 2.595 0.009469 **
## AGE 1.023e-02 6.833e-03 1.497 0.134368
## I(AGE^2) -1.874e-04 8.305e-05 -2.256 0.024066 *
## female -3.976e-01 1.146e-02 -34.689 < 2e-16 ***
## AfAm -1.124e-01 1.629e-02 -6.901 5.18e-12 ***
## Asian -7.391e-02 2.114e-02 -3.496 0.000472 ***
## race_oth -4.363e-02 1.892e-02 -2.306 0.021129 *
## Hispanic 8.710e-02 1.779e-02 4.897 9.75e-07 ***
## educ_hs 5.419e-01 1.887e-02 28.713 < 2e-16 ***
## educ_somecoll 8.580e-01 2.073e-02 41.401 < 2e-16 ***
## educ_college 1.123e+00 2.130e-02 52.708 < 2e-16 ***
## educ_advdeg 1.339e+00 2.387e-02 56.110 < 2e-16 ***
## MARSTmarried spouse absent -3.041e-01 3.022e-02 -10.065 < 2e-16 ***
## MARSTseparated -6.878e-02 3.318e-02 -2.073 0.038172 *
## MARSTdivorced 3.401e-02 2.099e-02 1.621 0.105098
## MARSTwidowed -1.821e-01 5.510e-02 -3.304 0.000952 ***
## MARSTnever married -2.153e-01 1.361e-02 -15.813 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 71408 on 74934 degrees of freedom
## Residual deviance: 64816 on 74918 degrees of freedom
## AIC: 64850
##
## Number of Fisher Scoring iterations: 5
Compared to the model_logic1, the AIC slightly decreases.